1
2
3
4
5
6
7
8
9
10
11
12
13 """Parse Unigene flat file format files such as the Hs.data file.
14
15 Here is an overview of the flat file format that this parser deals with:
16 Line types/qualifiers:
17
18 ID UniGene cluster ID
19 TITLE Title for the cluster
20 GENE Gene symbol
21 CYTOBAND Cytological band
22 EXPRESS Tissues of origin for ESTs in cluster
23 RESTR_EXPR Single tissue or development stage contributes
24 more than half the total EST frequency for this gene.
25 GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
26 T if a non-templated polyA tail is found among
27 a cluster's sequences; else
28 I if templated As are found in genomic sequence or
29 S if a canonical polyA signal is found on
30 the genomic sequence
31 GENE_ID Entrez gene identifier associated with at least one
32 sequence in this cluster;
33 to be used instead of LocusLink.
34 LOCUSLINK LocusLink identifier associated with at least one
35 sequence in this cluster;
36 deprecated in favor of GENE_ID
37 HOMOL Homology;
38 CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping
39 on the arabidopsis genome.
40 STS STS
41 ACC= GenBank/EMBL/DDBJ accession number of STS
42 [optional field]
43 UNISTS= identifier in NCBI's UNISTS database
44 TXMAP Transcript map interval
45 MARKER= Marker found on at least one sequence in this
46 cluster
47 RHPANEL= Radiation Hybrid panel used to place marker
48 PROTSIM Protein Similarity data for the sequence with
49 highest-scoring protein similarity in this cluster
50 ORG= Organism
51 PROTGI= Sequence GI of protein
52 PROTID= Sequence ID of protein
53 PCT= Percent alignment
54 ALN= length of aligned region (aa)
55 SCOUNT Number of sequences in the cluster
56 SEQUENCE Sequence
57 ACC= GenBank/EMBL/DDBJ accession number of sequence
58 NID= Unique nucleotide sequence identifier (gi)
59 PID= Unique protein sequence identifier (used for
60 non-ESTs)
61 CLONE= Clone identifier (used for ESTs only)
62 END= End (5'/3') of clone insert read (used for
63 ESTs only)
64 LID= Library ID; see Hs.lib.info for library name
65 and tissue
66 MGC= 5' CDS-completeness indicator; if present, the
67 clone associated with this sequence is believed
68 CDS-complete. A value greater than 511 is the gi
69 of the CDS-complete mRNA matched by the EST,
70 otherwise the value is an indicator of the
71 reliability of the test indicating CDS
72 completeness; higher values indicate more
73 reliable CDS-completeness predictions.
74 SEQTYPE= Description of the nucleotide sequence.
75 Possible values are mRNA, EST and HTC.
76 TRACE= The Trace ID of the EST sequence, as provided by
77 NCBI Trace Archive
78 """
79
80
class SequenceLine(object):
    """Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ACC=         GenBank/EMBL/DDBJ accession number of sequence
    NID=         Unique nucleotide sequence identifier (gi)
    PID=         Unique protein sequence identifier (used for non-ESTs)
    CLONE=       Clone identifier (used for ESTs only)
    END=         End (5'/3') of clone insert read (used for ESTs only)
    LID=         Library ID; see Hs.lib.info for library name and tissue
    MGC=         5' CDS-completeness indicator; if present,
                 the clone associated with this sequence
                 is believed CDS-complete. A value greater than 511
                 is the gi of the CDS-complete mRNA matched by the EST,
                 otherwise the value is an indicator of the reliability
                 of the test indicating CDS completeness;
                 higher values indicate more reliable CDS-completeness
                 predictions.
    SEQTYPE=     Description of the nucleotide sequence. Possible values
                 are mRNA, EST and HTC.
    TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                 Trace Archive

    Additional attributes set by this class:
    text         The raw text the instance was built from (None if none).
    is_image     True when the CLONE value starts with 'IMAGE'.
    image        The CLONE value with its first six characters removed
                 (i.e. without the leading 'IMAGE' and separator).
    """

    def __init__(self, text=None):
        """Set every field to its empty default, then parse text if given."""
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        self.trace = ''
        # Always define the attribute so callers can read it unconditionally.
        self.text = text
        if text is not None:
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Fill the attributes from 'KEY=value; KEY=value' formatted text.

        Raises ValueError if a non-empty fragment has no '=' separator.
        """
        for part in text.split('; '):
            if not part:
                # Tolerate empty text or a doubled separator.
                continue
            # Split on the first '=' only, so values may contain '='.
            key, sep, val = part.partition('=')
            if not sep:
                raise ValueError(
                    "Cannot parse SEQUENCE field %r (expected KEY=value)"
                    % part)
            if key == 'CLONE' and val[:5] == 'IMAGE':
                self.is_image = True
                self.image = val[6:]
            setattr(self, key.lower(), val)
132
135
136
class ProtsimLine(object):
    """Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE)
    ORG=         Organism
    PROTGI=      Sequence GI of protein
    PROTID=      Sequence ID of protein
    PCT=         Percent alignment
    ALN=         length of aligned region (aa)

    All values are kept as strings. The raw input is available as the
    ``text`` attribute (None when no text was supplied).
    """

    def __init__(self, text=None):
        """Set every field to its empty default, then parse text if given."""
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        # Always define the attribute so callers can read it unconditionally.
        self.text = text
        if text is not None:
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Fill the attributes from 'KEY=value; KEY=value' formatted text.

        Raises ValueError if a non-empty fragment has no '=' separator.
        """
        for part in text.split('; '):
            if not part:
                # Tolerate empty text or a doubled separator.
                continue
            # Split on the first '=' only, so values may contain '='.
            key, sep, val = part.partition('=')
            if not sep:
                raise ValueError(
                    "Cannot parse PROTSIM field %r (expected KEY=value)"
                    % part)
            setattr(self, key.lower(), val)
166
169
170
class STSLine(object):
    """Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE)

    ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
    UNISTS=      identifier in NCBI's UNISTS database

    All values are kept as strings. The raw input is available as the
    ``text`` attribute (None when no text was supplied).
    """

    def __init__(self, text=None):
        """Set every field to its empty default, then parse text if given."""
        self.acc = ''
        self.unists = ''
        # Always define the attribute so callers can read it unconditionally.
        self.text = text
        if text is not None:
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Fill the attributes from whitespace-separated 'KEY=value' fields.

        Raises ValueError if a field has no '=' separator.
        """
        # split() without an argument collapses repeated spaces and ignores
        # leading/trailing whitespace, so no empty fields are produced.
        for part in text.split():
            # Split on the first '=' only, so values may contain '='.
            key, sep, val = part.partition('=')
            if not sep:
                raise ValueError(
                    "Cannot parse STS field %r (expected KEY=value)" % part)
            setattr(self, key.lower(), val)
195
198
199
class Record(object):
    """Store a Unigene record.

    Here is what is stored:

        self.ID           = ''  # ID line
        self.species      = ''  # Hs, Bt, etc.
        self.title        = ''  # TITLE line
        self.symbol       = ''  # GENE line
        self.cytoband     = ''  # CYTOBAND line
        self.express      = []  # EXPRESS line, parsed on '|'
                                # Will be an array of strings
        self.restr_expr   = ''  # RESTR_EXPR line
        self.gnm_terminus = ''  # GNM_TERMINUS line
        self.gene_id      = ''  # GENE_ID line
        self.locuslink    = ''  # LOCUSLINK line
        self.homol        = ''  # HOMOL line
        self.chromosome   = ''  # CHROMOSOME line
        self.protsim      = []  # PROTSIM entries, array of Protsims
                                # Type ProtsimLine
        self.sequence     = []  # SEQUENCE entries, array of Sequence entries
                                # Type SequenceLine
        self.sts          = []  # STS entries, array of STS entries
                                # Type STSLine
        self.txmap        = []  # TXMAP entries, array of TXMap entries
    """

    def __init__(self):
        """Create an empty record; every field starts at its default."""
        self.ID = ''
        self.species = ''
        self.title = ''
        self.symbol = ''
        self.cytoband = ''
        self.express = []
        self.restr_expr = ''
        self.gnm_terminus = ''
        self.gene_id = ''
        self.locuslink = ''
        self.homol = ''
        self.chromosome = ''
        self.protsim = []
        self.sequence = []
        self.sts = []
        self.txmap = []

    # NOTE(review): the def line of this method was missing from the reviewed
    # source; __repr__ is inferred from the "<ClassName> ..." output format
    # (str() falls back to it as well) -- confirm against the original file.
    def __repr__(self):
        """Return '<ClassName> ID symbol' followed by the title on a new line."""
        return "<%s> %s %s\n%s" % (self.__class__.__name__,
                                   self.ID, self.symbol, self.title)
248
249
# NOTE(review): the def line of this function was missing from the reviewed
# source; the name `parse` and the single `handle` parameter are inferred
# from the body -- confirm against the original file.
def parse(handle):
    """Iterate over the UniGene records available from handle.

    Yields each record returned by _read(handle) in turn and stops as soon
    as _read returns a false value (i.e. no further record was found).
    Any exception raised by _read propagates to the caller.
    """
    while True:
        record = _read(handle)
        if not record:
            return
        yield record
256
257
# NOTE(review): the def line of this function was missing from the reviewed
# source; the name `read` and the single `handle` parameter are inferred
# from the body -- confirm against the original file.
def read(handle):
    """Read exactly one UniGene record from handle and return it.

    Raises ValueError if no record is found, or if any data is left in the
    handle after the first record.
    """
    record = _read(handle)
    if not record:
        # This module parses UniGene files; the earlier message named
        # SwissProt by mistake.
        raise ValueError("No UniGene record found")

    # Anything left after the first record means the input held more than
    # the single record this function promises.
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one UniGene record found")
    return record
267
268
269
270
271
def _read(handle):
    """Read the next UniGene record from handle.

    Each line is split into a tag (the first 12 columns) and a value (the
    rest of the line). An ID line starts a new Record; a '//' line ends it
    and the record is returned.

    Returns the Record, or None when the handle yields no lines at all.

    Raises ValueError when:
      - a line appears before any ID line,
      - a HOMOL value is neither YES nor NO,
      - the tag is not recognised,
      - SCOUNT is present and disagrees with the number of SEQUENCE lines,
      - the input ends before the '//' terminator of a started record.
    """
    UG_INDENT = 12  # width of the tag column; the value starts after it

    # Tags whose value is stored verbatim in a Record attribute.
    plain_fields = {
        "TITLE": "title",
        "GENE": "symbol",
        "GENE_ID": "gene_id",
        "LOCUSLINK": "locuslink",
        "CHROMOSOME": "chromosome",
        "CYTOBAND": "cytoband",
        "GNM_TERMINUS": "gnm_terminus",
    }

    record = None
    scount = None  # stays None when the record has no SCOUNT line
    for line in handle:
        tag = line[:UG_INDENT].rstrip()
        value = line[UG_INDENT:].rstrip()
        if tag == "ID":
            record = Record()
            record.ID = value
            record.species = value.split('.')[0]
            scount = None
            continue
        if record is None:
            # Without this guard the branches below would fail with an
            # AttributeError on None instead of a parse error.
            raise ValueError("Found line before the ID line of a record: %s"
                             % line.rstrip())
        if tag in plain_fields:
            setattr(record, plain_fields[tag], value)
        elif tag == "HOMOL":
            if value == "YES":
                record.homol = True
            elif value == "NO":
                record.homol = False
            else:
                raise ValueError("Cannot parse HOMOL line %s" % line.rstrip())
        elif tag == "EXPRESS":
            record.express = [word.strip() for word in value.split("|")]
        elif tag == "RESTR_EXPR":
            record.restr_expr = [word.strip() for word in value.split("|")]
        elif tag == "PROTSIM":
            record.protsim.append(ProtsimLine(value))
        elif tag == "SCOUNT":
            scount = int(value)
        elif tag == "SEQUENCE":
            record.sequence.append(SequenceLine(value))
        elif tag == "STS":
            record.sts.append(STSLine(value))
        elif tag == "TXMAP":
            # Kept as the raw value text; no dedicated line class is used.
            record.txmap.append(value)
        elif tag == '//':
            found = len(record.sequence)
            # Only check the count when the record actually declared one.
            if scount is not None and found != scount:
                raise ValueError(
                    "The number of sequences specified in the record"
                    " (%d) does not agree with the number of sequences"
                    " found (%d)" % (scount, found))
            return record
        else:
            raise ValueError("Unknown tag %s" % tag)
    if record is not None:
        # A record was started but its '//' terminator never arrived.
        raise ValueError("Unexpected end of stream.")
    return None
325