Package Bio :: Package UniGene
[hide private]
[frames] | [no frames]

Source Code for Package Bio.UniGene

  1  # Copyright 2006 by Sean Davis.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  # 
  6  # $Id: __init__.py,v 1.12 2009-04-24 12:03:45 mdehoon Exp $ 
  7  # Sean Davis <sdavis2 at mail dot nih dot gov> 
  8  # National Cancer Institute 
  9  # National Institutes of Health 
 10  # Bethesda, MD, USA 
 11  # 
 12   
 13  """Parse Unigene flat file format files such as the Hs.data file. 
 14   
 15  Here is an overview of the flat file format that this parser deals with: 
 16   
 17     Line types/qualifiers:: 
 18   
 19         ID           UniGene cluster ID 
 20         TITLE        Title for the cluster 
 21         GENE         Gene symbol 
 22         CYTOBAND     Cytological band 
 23         EXPRESS      Tissues of origin for ESTs in cluster 
 24         RESTR_EXPR   Single tissue or development stage contributes 
 25                      more than half the total EST frequency for this gene. 
 26         GNM_TERMINUS genomic confirmation of presence of a 3' terminus; 
 27                      T if a non-templated polyA tail is found among 
 28                      a cluster's sequences; else 
 29                      I if templated As are found in genomic sequence or 
 30                      S if a canonical polyA signal is found on 
 31                        the genomic sequence 
 32         GENE_ID      Entrez gene identifier associated with at least one 
 33                      sequence in this cluster; 
 34                      to be used instead of LocusLink. 
 35         LOCUSLINK    LocusLink identifier associated with at least one 
 36                      sequence in this cluster; 
 37                      deprecated in favor of GENE_ID 
 38         HOMOL        Homology; 
 39         CHROMOSOME   Chromosome.  For plants, CHROMOSOME refers to mapping
 40                      on the Arabidopsis genome.
 41         STS          STS 
 42              ACC=         GenBank/EMBL/DDBJ accession number of STS 
 43                           [optional field] 
 44              UNISTS=      identifier in NCBI's UNISTS database 
 45         TXMAP        Transcript map interval 
 46              MARKER=      Marker found on at least one sequence in this 
 47                           cluster 
 48              RHPANEL=     Radiation Hybrid panel used to place marker 
 49         PROTSIM      Protein Similarity data for the sequence with 
 50                      highest-scoring protein similarity in this cluster 
 51              ORG=         Organism 
 52              PROTGI=      Sequence GI of protein 
 53              PROTID=      Sequence ID of protein 
 54              PCT=         Percent alignment 
 55              ALN=         length of aligned region (aa) 
 56         SCOUNT       Number of sequences in the cluster 
 57         SEQUENCE     Sequence 
 58              ACC=         GenBank/EMBL/DDBJ accession number of sequence 
 59              NID=         Unique nucleotide sequence identifier (gi) 
 60              PID=         Unique protein sequence identifier (used for 
 61                           non-ESTs) 
 62              CLONE=       Clone identifier (used for ESTs only) 
 63              END=         End (5'/3') of clone insert read (used for 
 64                           ESTs only) 
 65              LID=         Library ID; see Hs.lib.info for library name 
 66                           and tissue 
 67              MGC=         5' CDS-completeness indicator; if present, the 
 68                           clone associated with this sequence is believed 
 69                           CDS-complete. A value greater than 511 is the gi 
 70                           of the CDS-complete mRNA matched by the EST, 
 71                           otherwise the value is an indicator of the 
 72                           reliability of the test indicating CDS 
 73                           completeness; higher values indicate more 
 74                           reliable CDS-completeness predictions. 
 75             SEQTYPE=      Description of the nucleotide sequence. 
 76                           Possible values are mRNA, EST and HTC. 
 77             TRACE=        The Trace ID of the EST sequence, as provided by 
 78                           NCBI Trace Archive 
 79  """ 
 80   
 81   
class SequenceLine(object):
    """Store the information for one SEQUENCE line from a Unigene file.

    Initialize with the text part of the SEQUENCE line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

     - ACC=         GenBank/EMBL/DDBJ accession number of sequence
     - NID=         Unique nucleotide sequence identifier (gi)
     - PID=         Unique protein sequence identifier (used for non-ESTs)
     - CLONE=       Clone identifier (used for ESTs only)
     - END=         End (5'/3') of clone insert read (used for ESTs only)
     - LID=         Library ID; see Hs.lib.info for library name and tissue
     - MGC=         5' CDS-completeness indicator; if present,
                    the clone associated with this sequence
                    is believed CDS-complete. A value greater than 511
                    is the gi of the CDS-complete mRNA matched by the EST,
                    otherwise the value is an indicator of the reliability
                    of the test indicating CDS completeness;
                    higher values indicate more reliable CDS-completeness
                    predictions.
     - SEQTYPE=     Description of the nucleotide sequence. Possible values
                    are mRNA, EST and HTC.
     - TRACE=       The Trace ID of the EST sequence, as provided by NCBI
                    Trace Archive

    Additional attributes:

     - text         The raw text the object was built from ('' if none).
     - is_image     True when the CLONE value starts with 'IMAGE'.
     - image        The CLONE value with its first six characters
                    (the 'IMAGE:' prefix) removed, when is_image is True.
    """

    def __init__(self, text=None):
        """Create an empty line object, or fill it from ``text`` if given."""
        # Always define ``text`` so that repr() works on an empty object.
        self.text = ''
        self.acc = ''
        self.nid = ''
        self.lid = ''
        self.pid = ''
        self.clone = ''
        self.image = ''
        self.is_image = False
        self.end = ''
        self.mgc = ''
        self.seqtype = ''
        self.trace = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per ``KEY=value`` part of ``text``.

        Parts are separated by '; '.  Raises ValueError if a part does not
        contain '='.
        """
        for part in text.split('; '):
            # Split on the first '=' only, so a value may itself contain '='.
            key, val = part.split("=", 1)
            if key == 'CLONE' and val[:5] == 'IMAGE':
                self.is_image = True
                self.image = val[6:]  # drop the leading 'IMAGE:'
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text of the line ('' for an empty object)."""
        return self.text
137 138
class ProtsimLine(object):
    """Store the information for one PROTSIM line from a Unigene file.

    Initialize with the text part of the PROTSIM line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

     - ORG=         Organism
     - PROTGI=      Sequence GI of protein
     - PROTID=      Sequence ID of protein
     - PCT=         Percent alignment
     - ALN=         length of aligned region (aa)

    Additional attribute:

     - text         The raw text the object was built from ('' if none).
    """

    def __init__(self, text=None):
        """Create an empty line object, or fill it from ``text`` if given."""
        # Always define ``text`` so that repr() works on an empty object.
        self.text = ''
        self.org = ''
        self.protgi = ''
        self.protid = ''
        self.pct = ''
        self.aln = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per ``KEY=value`` part of ``text``.

        Parts are separated by '; '.  Raises ValueError if a part does not
        contain '='.
        """
        for part in text.split('; '):
            # Split on the first '=' only, so a value may itself contain '='.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text of the line ('' for an empty object)."""
        return self.text
171 172
class STSLine(object):
    """Store the information for one STS line from a Unigene file.

    Initialize with the text part of the STS line, or nothing.

    Attributes and descriptions (access as LOWER CASE):

     - ACC=         GenBank/EMBL/DDBJ accession number of STS [optional field]
     - UNISTS=      identifier in NCBI's UNISTS database

    Additional attribute:

     - text         The raw text the object was built from ('' if none).
    """

    def __init__(self, text=None):
        """Create an empty line object, or fill it from ``text`` if given."""
        # Always define ``text`` so that repr() works on an empty object.
        self.text = ''
        self.acc = ''
        self.unists = ''
        if text is not None:
            self.text = text
            self._init_from_text(text)

    def _init_from_text(self, text):
        """Set one lower-case attribute per ``KEY=value`` part of ``text``.

        Unlike the other line types, STS parts are separated by a single
        space.  Raises ValueError if a part does not contain '='.
        """
        for part in text.split(' '):
            # Split on the first '=' only, so a value may itself contain '='.
            key, val = part.split("=", 1)
            setattr(self, key.lower(), val)

    def __repr__(self):
        """Return the raw text of the line ('' for an empty object)."""
        return self.text
200 201
class Record(object):
    """Hold the data of a single Unigene record.

    Attributes, with the flat-file line each one is associated with::

        ID            ID line
        species       Hs, Bt, etc.
        title         TITLE line
        symbol        GENE line
        cytoband      CYTOBAND line
        express       EXPRESS line, a list of strings
        restr_expr    RESTR_EXPR line
        gnm_terminus  GNM_TERMINUS line
        gene_id       GENE_ID line
        locuslink     LOCUSLINK line
        homol         HOMOL line
        chromosome    CHROMOSOME line
        protsim       PROTSIM entries, a list of ProtsimLine objects
        sequence      SEQUENCE entries, a list of SequenceLine objects
        sts           STS entries, a list of STSLine objects
        txmap         TXMAP entries, a list

    A freshly created record has '' in every text attribute and an empty
    list in every list attribute.
    """

    def __init__(self):
        """Create an empty record."""
        # (attribute name, factory for its empty value).  Calling the
        # factory per attribute gives every list attribute its own list.
        layout = (
            ('ID', str),
            ('species', str),
            ('title', str),
            ('symbol', str),
            ('cytoband', str),
            ('express', list),
            ('restr_expr', str),
            ('gnm_terminus', str),
            ('gene_id', str),
            ('locuslink', str),
            ('homol', str),
            ('chromosome', str),
            ('protsim', list),
            ('sequence', list),
            ('sts', list),
            ('txmap', list),
        )
        for attribute, make_empty in layout:
            setattr(self, attribute, make_empty())

    def __repr__(self):
        """Return '<ClassName> ID symbol' followed by the title on a new line."""
        fields = (self.__class__.__name__, self.ID, self.symbol, self.title)
        return "<%s> %s %s\n%s" % fields
250 251
def parse(handle):
    """Iterate over the Unigene records in ``handle``.

    This is a generator: each record returned by ``_read`` is yielded in
    turn, and iteration ends as soon as ``_read`` returns nothing.
    """
    current = _read(handle)
    while current:
        yield current
        current = _read(handle)
258 259
def read(handle):
    """Read a handle that contains exactly one Unigene record.

    Returns the record produced by ``_read``.

    Raises ValueError if no record is found, or if any data is left in
    the handle after the first record.
    """
    record = _read(handle)
    if not record:
        raise ValueError("No UniGene record found")
    # _read returns as soon as it sees the '//' terminator, so anything
    # still left in the handle lies beyond the first record.
    remainder = handle.read()
    if remainder:
        raise ValueError("More than one UniGene record found")
    return record
269 270 271 # Everything below is private 272 273
274 -def _read(handle):
275 UG_INDENT = 12 276 record = None 277 for line in handle: 278 tag, value = line[:UG_INDENT].rstrip(), line[UG_INDENT:].rstrip() 279 line = line.rstrip() 280 if tag == "ID": 281 record = Record() 282 record.ID = value 283 record.species = record.ID.split('.')[0] 284 elif tag == "TITLE": 285 record.title = value 286 elif tag == "GENE": 287 record.symbol = value 288 elif tag == "GENE_ID": 289 record.gene_id = value 290 elif tag == "LOCUSLINK": 291 record.locuslink = value 292 elif tag == "HOMOL": 293 if value == "YES": 294 record.homol = True 295 elif value == "NO": 296 record.homol = True 297 else: 298 raise ValueError("Cannot parse HOMOL line %s" % line) 299 elif tag == "EXPRESS": 300 record.express = [word.strip() for word in value.split("|")] 301 elif tag == "RESTR_EXPR": 302 record.restr_expr = [word.strip() for word in value.split("|")] 303 elif tag == "CHROMOSOME": 304 record.chromosome = value 305 elif tag == "CYTOBAND": 306 record.cytoband = value 307 elif tag == "PROTSIM": 308 protsim = ProtsimLine(value) 309 record.protsim.append(protsim) 310 elif tag == "SCOUNT": 311 scount = int(value) 312 elif tag == "SEQUENCE": 313 sequence = SequenceLine(value) 314 record.sequence.append(sequence) 315 elif tag == "STS": 316 sts = STSLine(value) 317 record.sts.append(sts) 318 elif tag == '//': 319 if len(record.sequence) != scount: 320 raise ValueError("The number of sequences specified in the record" 321 " (%d) does not agree with the number of sequences found (%d)" % (scount, len(record.sequence))) 322 return record 323 else: 324 raise ValueError("Unknown tag %s" % tag) 325 if record: 326 raise ValueError("Unexpected end of stream.")
327