Package Bio :: Package UniProt :: Module GOA
[hide private]
[frames | no frames]

Source Code for Module Bio.UniProt.GOA

  1  #!/usr/bin/env python 
  2  # Copyright 2013, 2016 by Iddo Friedberg idoerg@gmail.com 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """Parsers for the GAF, GPA and GPI formats from UniProt-GOA. 
  8   
  9  Uniprot-GOA README + GAF format description: 
 10  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README 
 11   
 12  GAF formats: 
 13  http://www.geneontology.org/GO.format.annotation.shtml 
 14  gp_association (GPA format) README: 
 15  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gp_association_readme 
 16   
 17  gp_information (GPI format) README: 
 18  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gp_information_readme 
 19  """ 
 20   
 21  from __future__ import print_function 
 22   
 23  import copy 
 24   
 25  from Bio._py3k import zip 
 26   
 27  # GAF: GO Annotation Format 
 28  # 
 29  # GAF version 2.0 
 30   
 31  GAF20FIELDS = ['DB', 
 32                 'DB_Object_ID', 
 33                 'DB_Object_Symbol', 
 34                 'Qualifier', 
 35                 'GO_ID', 
 36                 'DB:Reference', 
 37                 'Evidence', 
 38                 'With', 
 39                 'Aspect', 
 40                 'DB_Object_Name', 
 41                 'Synonym', 
 42                 'DB_Object_Type', 
 43                 'Taxon_ID', 
 44                 'Date', 
 45                 'Assigned_By', 
 46                 'Annotation_Extension', 
 47                 'Gene_Product_Form_ID'] 
 48   
 49  # GAF version 1.0 
 50  GAF10FIELDS = ['DB', 
 51                 'DB_Object_ID', 
 52                 'DB_Object_Symbol', 
 53                 'Qualifier', 
 54                 'GO_ID', 
 55                 'DB:Reference', 
 56                 'Evidence', 
 57                 'With', 
 58                 'Aspect', 
 59                 'DB_Object_Name', 
 60                 'Synonym', 
 61                 'DB_Object_Type', 
 62                 'Taxon_ID', 
 63                 'Date', 
 64                 'Assigned_By'] 
 65   
 66  # GPA version 1.0 
 67  GPA10FIELDS = [ 
 68      'DB', 
 69      'DB_Object_ID', 
 70      'Qualifier', 
 71      'GO_ID', 
 72      'DB:Reference', 
 73      'Evidence code', 
 74      'With', 
 75      'Interacting_taxon_ID', 
 76      'Date', 
 77      'Assigned_by', 
 78      'Annotation_Extension', 
 79      'Spliceform_ID'] 
 80   
 81  # GPA version 1.1 
 82  GPA11FIELDS = [ 
 83      'DB', 
 84      'DB_Object_ID', 
 85      'Qualifier', 
 86      'GO_ID', 
 87      'DB:Reference', 
 88      'ECO_Evidence_code', 
 89      'With', 
 90      'Interacting_taxon_ID', 
 91      'Date', 
 92      'Assigned_by', 
 93      'Annotation Extension', 
 94      'Annotation_Properties'] 
 95   
 96  # GPI version 1.0 
 97  GPI10FIELDS = [ 
 98      'DB', 
 99      'DB_subset', 
100      'DB_Object_ID', 
101      'DB_Object_Symbol', 
102      'DB_Object_Name', 
103      'DB_Object_Synonym', 
104      'DB_Object_Type', 
105      'Taxon', 
106      'Annotation_Target_Set', 
107      'Annotation_Completed', 
108      'Parent_Object_ID'] 
109   
# GPI version 1.1: column names of a gp_information record, in file order.
# These are the keys of the dictionaries produced by the GPI 1.1 iterator.
GPI11FIELDS = [
    'DB_Object_ID',
    'DB_Object_Symbol',
    'DB_Object_Name',
    'DB_Object_Synonym',
    'DB_Object_Type',
    'Taxon',
    'Parent_Object_ID',
    'DB_Xref',
    'Gene_Product_Properties',
    'Annotation_Target_Set',
    'GO_Annotation_Complete',
]
123   
124   
def _gpi10iterator(handle):
    """Iterate over the records of a GPI 1.0 file (PRIVATE).

    Yields one dictionary per data line, keyed by the names in GPI10FIELDS.
    Lines starting with '!' and lines without any tab-separated content are
    skipped. The pipe-separated columns DB_Object_Synonym and
    Annotation_Target_Set are returned as lists of strings.
    """
    # Column positions whose values are '|'-separated lists:
    # 5 = DB_Object_Synonym, 8 = Annotation_Target_Set.
    list_columns = (5, 8)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            # Blank (or tab-free) line: nothing to parse.
            continue
        for index in list_columns:
            columns[index] = columns[index].split('|')
        yield dict(zip(GPI10FIELDS, columns))
140 141
def _gpi11iterator(handle):
    """Iterate over the records of a GPI 1.1 file (PRIVATE).

    Yields one dictionary per data line, keyed by the names in GPI11FIELDS.
    Lines starting with '!' and lines without any tab-separated content are
    skipped. The pipe-separated columns DB_Object_Name, DB_Object_Synonym,
    DB_Xref and Gene_Product_Properties are returned as lists of strings.
    """
    # Column positions whose values are '|'-separated lists:
    # 2 = DB_Object_Name, 3 = DB_Object_Synonym, 7 = DB_Xref,
    # 8 = Gene_Product_Properties.
    list_columns = (2, 3, 7, 8)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            # Blank (or tab-free) line: nothing to parse.
            continue
        for index in list_columns:
            columns[index] = columns[index].split('|')
        yield dict(zip(GPI11FIELDS, columns))
159 160
def gpi_iterator(handle):
    """Read GPI format files.

    This is the public entry point for reading a gp_information file.
    The first line of the handle is consumed and must be the version
    header; the matching version-specific iterator is returned, reading
    from the rest of the handle.

    Raises NotImplementedError for a '!gpi-version: 2.1' header and
    ValueError for any other unrecognised first line.
    """
    header = handle.readline()
    version_tag = header.strip()
    if version_tag == '!gpi-version: 1.1':
        return _gpi11iterator(handle)
    if version_tag == '!gpi-version: 1.0':
        return _gpi10iterator(handle)
    if version_tag == '!gpi-version: 2.1':
        raise NotImplementedError("Sorry, parsing GPI version 2 not implemented yet.")
    raise ValueError('Unknown GPI version {0}\n'.format(header))
182 183
def _gpa10iterator(handle):
    """Iterate over the records of a GPA 1.0 file (PRIVATE).

    Yields one dictionary per data line, keyed by the names in GPA10FIELDS.
    Lines starting with '!' and lines without any tab-separated content are
    skipped. The pipe-separated columns Qualifier, DB:Reference, With and
    Annotation_Extension are returned as lists of strings.

    Do not call directly; use the gpa_iterator function instead.
    """
    # Column positions whose values are '|'-separated lists:
    # 2 = Qualifier, 4 = DB:Reference, 6 = With, 10 = Annotation_Extension.
    list_columns = (2, 4, 6, 10)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            # Blank (or tab-free) line: nothing to parse.
            continue
        for index in list_columns:
            columns[index] = columns[index].split('|')
        yield dict(zip(GPA10FIELDS, columns))
202 203
def _gpa11iterator(handle):
    """Iterate over the records of a GPA 1.1 file (PRIVATE).

    Yields one dictionary per data line, keyed by the names in GPA11FIELDS.
    Lines starting with '!' and lines without any tab-separated content are
    skipped. The pipe-separated columns Qualifier, DB:Reference, With and
    'Annotation Extension' are returned as lists of strings.

    Do not call directly; use the gpa_iterator function instead.
    """
    # Column positions whose values are '|'-separated lists:
    # 2 = Qualifier, 4 = DB:Reference, 6 = With, 10 = Annotation Extension.
    list_columns = (2, 4, 6, 10)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            # Blank (or tab-free) line: nothing to parse.
            continue
        for index in list_columns:
            columns[index] = columns[index].split('|')
        yield dict(zip(GPA11FIELDS, columns))
222 223
def gpa_iterator(handle):
    """Read GPA format files.

    This is the public entry point for reading a GPA (gp_association)
    file. The first line of the handle is consumed and must be the version
    header; a GPA 1.1 or GPA 1.0 iterator reading from the rest of the
    handle is returned accordingly.

    Raises ValueError if the first line is not a recognised version header.
    """
    header = handle.readline()
    version_tag = header.strip()
    if version_tag == '!gpa-version: 1.1':
        return _gpa11iterator(handle)
    if version_tag == '!gpa-version: 1.0':
        return _gpa10iterator(handle)
    raise ValueError('Unknown GPA version {0}\n'.format(header))
240 241
def _gaf20iterator(handle):
    """Iterate over the records of a GAF 2.0 file (PRIVATE).

    Yields one dictionary per data line, keyed by the names in GAF20FIELDS.
    Lines starting with '!' and lines without any tab-separated content are
    skipped. The pipe-separated columns Qualifier, DB:Reference, With,
    Synonym and Taxon_ID are returned as lists of strings.
    """
    # Column positions whose values are '|'-separated lists:
    # 3 = Qualifier, 5 = DB:Reference, 7 = With, 10 = Synonym, 12 = Taxon_ID.
    list_columns = (3, 5, 7, 10, 12)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            # Blank (or tab-free) line: nothing to parse.
            continue
        for index in list_columns:
            columns[index] = columns[index].split('|')
        yield dict(zip(GAF20FIELDS, columns))
255 256
def _gaf10iterator(handle):
    """Iterate over the records of a GAF 1.0 file (PRIVATE).

    Yields one dictionary per data line, keyed by the names in GAF10FIELDS.
    Lines starting with '!' and lines without any tab-separated content are
    skipped. The pipe-separated columns Qualifier, DB:Reference, With,
    Synonym and Taxon_ID are returned as lists of strings.
    """
    # Column positions whose values are '|'-separated lists:
    # 3 = Qualifier, 5 = DB:Reference, 7 = With, 10 = Synonym, 12 = Taxon_ID.
    list_columns = (3, 5, 7, 10, 12)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            # Blank (or tab-free) line: nothing to parse.
            continue
        for index in list_columns:
            columns[index] = columns[index].split('|')
        yield dict(zip(GAF10FIELDS, columns))
270 271
def _gaf10byproteiniterator(handle):
    """Iterate over a GAF 1.0 file one gene product at a time (PRIVATE).

    Yields lists of record dictionaries (keyed by the names in GAF10FIELDS).
    Each list holds a run of consecutive records that share the same
    DB_Object_ID. Lines starting with '!' and lines without any
    tab-separated content are skipped, and the pipe-separated columns
    Qualifier, DB:Reference, With, Synonym and Taxon_ID are returned as
    lists of strings.

    The final run of records is yielded once the handle is exhausted;
    nothing is yielded for a file without data lines.
    """
    cur_id = None
    id_rec_list = []
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        # 3 = Qualifier, 5 = DB:Reference, 7 = With, 10 = Synonym,
        # 12 = Taxon_ID: all '|'-separated lists.
        for index in (3, 5, 7, 10, 12):
            inrec[index] = inrec[index].split('|')
        cur_rec = dict(zip(GAF10FIELDS, inrec))
        rec_id = cur_rec['DB_Object_ID']
        if cur_id and rec_id != cur_id:
            # A new gene product starts: hand over the finished group and
            # begin a fresh list (the yielded list is never touched again).
            yield id_rec_list
            id_rec_list = []
        cur_id = rec_id
        id_rec_list.append(cur_rec)
    # Flush the last group, which no following ID change would emit.
    if id_rec_list:
        yield id_rec_list
295 296
def _gaf20byproteiniterator(handle):
    """Iterate over a GAF 2.0 file one gene product at a time (PRIVATE).

    Yields lists of record dictionaries (keyed by the names in GAF20FIELDS).
    Each list holds a run of consecutive records that share the same
    DB_Object_ID. Lines starting with '!' and lines without any
    tab-separated content are skipped, and the pipe-separated columns
    Qualifier, DB:Reference, With, Synonym and Taxon_ID are returned as
    lists of strings.

    The final run of records is yielded once the handle is exhausted;
    nothing is yielded for a file without data lines.
    """
    cur_id = None
    id_rec_list = []
    for inline in handle:
        if inline[0] == '!':
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        # 3 = Qualifier, 5 = DB:Reference, 7 = With, 10 = Synonym,
        # 12 = Taxon_ID: all '|'-separated lists.
        for index in (3, 5, 7, 10, 12):
            inrec[index] = inrec[index].split('|')
        cur_rec = dict(zip(GAF20FIELDS, inrec))
        rec_id = cur_rec['DB_Object_ID']
        if cur_id and rec_id != cur_id:
            # A new gene product starts: hand over the finished group and
            # begin a fresh list (the yielded list is never touched again).
            yield id_rec_list
            id_rec_list = []
        cur_id = rec_id
        id_rec_list.append(cur_rec)
    # Flush the last group, which no following ID change would emit.
    if id_rec_list:
        yield id_rec_list
320 321
def gafbyproteiniterator(handle):
    """Iterate over a gene association file, one gene product at a time.

    The returned iterator yields lists of consecutive records that share
    the same DB_Object_ID. The first line of the handle is consumed and
    must be the GAF version header; a GAF 2.0 or GAF 1.0 by-protein
    iterator reading from the rest of the handle is returned accordingly.
    GAF 2.1 files are currently read with the GAF 2.0 iterator.

    Raises ValueError if the first line is not a recognised version header.
    """
    header = handle.readline()
    version_tag = header.strip()
    if version_tag == '!gaf-version: 2.0':
        return _gaf20byproteiniterator(handle)
    if version_tag == '!gaf-version: 1.0':
        return _gaf10byproteiniterator(handle)
    if version_tag == '!gaf-version: 2.1':
        # Handle GAF 2.1 as GAF 2.0 for now. TODO: fix
        return _gaf20byproteiniterator(handle)
    raise ValueError('Unknown GAF version {0}\n'.format(header))
345 346
def gafiterator(handle):
    """Iterate over a GAF 1.0 or 2.0 file.

    This function should be called to read a
    gene_association.goa_uniprot file. The first line of the handle is
    consumed and must be the GAF version header; a GAF 2.0 or GAF 1.0
    iterator reading from the rest of the handle is returned accordingly.
    GAF 2.1 files are currently read with the GAF 2.0 iterator.

    Raises ValueError if the first line is not a recognised version header.

    Example: open, read, iterate and filter results.

    Original data file has been trimmed to ~600 rows.

    Original source ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/YEAST/goa_yeast.gaf.gz

    >>> from Bio.UniProt.GOA import gafiterator, record_has
    >>> Evidence = {'Evidence': set(['ND'])}
    >>> Synonym = {'Synonym': set(['YA19A_YEAST', 'YAL019W-A'])}
    >>> Taxon_ID = {'Taxon_ID': set(['taxon:559292'])}
    >>> with open('UniProt/goa_yeast.gaf', 'r') as handle:
    ...     for rec in gafiterator(handle):
    ...         if record_has(rec, Taxon_ID) and record_has(rec, Evidence) and record_has(rec, Synonym):
    ...             for key in ('DB_Object_Name', 'Evidence', 'Synonym', 'Taxon_ID'):
    ...                 print(rec[key])
    ...
    Putative uncharacterized protein YAL019W-A
    ND
    ['YA19A_YEAST', 'YAL019W-A']
    ['taxon:559292']
    Putative uncharacterized protein YAL019W-A
    ND
    ['YA19A_YEAST', 'YAL019W-A']
    ['taxon:559292']
    Putative uncharacterized protein YAL019W-A
    ND
    ['YA19A_YEAST', 'YAL019W-A']
    ['taxon:559292']

    """
    header = handle.readline()
    version_tag = header.strip()
    if version_tag == '!gaf-version: 2.0':
        return _gaf20iterator(handle)
    if version_tag == '!gaf-version: 2.1':
        # Handle GAF 2.1 as GAF 2.0 for now. TODO: fix
        return _gaf20iterator(handle)
    if version_tag == '!gaf-version: 1.0':
        return _gaf10iterator(handle)
    raise ValueError('Unknown GAF version {0}\n'.format(header))
397 398
def writerec(outrec, handle, fields=GAF20FIELDS):
    """Write a single UniProt-GOA record to an output stream.

    The record is written as one tab-separated line terminated by a
    newline. No version header is written; that is left to the caller.

    Arguments:
     - outrec - dictionary holding the record, as produced by the
       iterators in this module. A value may be a string or a list of
       strings; lists are written joined with '|'.
     - handle - output stream; only its write() method is used.
     - fields - column names to write, in output order. The caller should
       know the format version. Default: GAF 2.0 (GAF20FIELDS).

    An empty list is written as an empty column, so the number of columns
    always equals len(fields). Raises KeyError if outrec lacks one of the
    requested fields.
    """
    columns = []
    for field in fields:
        value = outrec[field]
        if isinstance(value, list):
            # Multi-valued column: restore the pipe-separated file form.
            value = '|'.join(value)
        columns.append(value)
    handle.write('\t'.join(columns) + '\n')
416 417
def writebyproteinrec(outprotrec, handle, fields=GAF20FIELDS):
    """Write a list of GAF records to an output stream.

    Each record in outprotrec is written in order with writerec, using
    the given fields. The caller should know the format version.
    Default: GAF 2.0 (GAF20FIELDS). Typically the list is one of those
    produced by gafbyproteiniterator, i.e. consecutive records that share
    the same DB_Object_ID.
    """
    for record in outprotrec:
        writerec(record, handle, fields=fields)
428 429
def record_has(inrec, fieldvals):
    """Check whether a record matches any of the given field values.

    Accepts a record and a dictionary of field values whose format is
    {'field_name': set([val1, val2])}. Returns True as soon as one of the
    named fields of the record holds at least one of the values listed
    for it, and False otherwise (including when fieldvals is empty).
    """
    def _as_set(value):
        # A plain string is one value, not an iterable of characters.
        if isinstance(value, str):
            return {value}
        return set(value)

    return any(_as_set(inrec[name]) & wanted
               for name, wanted in fieldvals.items())
if __name__ == "__main__":
    # Script entry point: hand off to Biopython's run_doctest helper, which
    # presumably runs the doctests embedded in this module's docstrings
    # (helper is defined in Bio._utils, not visible here - confirm).
    from Bio._utils import run_doctest
    run_doctest(verbose=0)