Package Bio :: Package UniProt :: Module GOA
[hide private]
[frames | no frames]

Source Code for Module Bio.UniProt.GOA

  1  #!/usr/bin/env python 
  2  # Copyright 2013, 2016 by Iddo Friedberg idoerg@gmail.com 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """Parsers for the GAF, GPA and GPI formats from UniProt-GOA. 
  8   
  9  UniProt-GOA README + GAF format description: 
 10  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README 
 11   
 12  GAF formats: 
 13  http://www.geneontology.org/GO.format.annotation.shtml 
 14  gp_association (GPA format) README: 
 15  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gp_association_readme 
 16   
 17  gp_information (GPI format) README: 
 18  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gp_information_readme 
 19  """ 
 20   
 21  import copy 
 22  import sys 
 23   
 24  from Bio._py3k import zip 
 25   
 26   
 27  # GAF: GO Annotation Format 
 28  # 
 29  # GAF version 2.0 
 30   
 31  GAF20FIELDS = ['DB', 
 32          'DB_Object_ID', 
 33          'DB_Object_Symbol', 
 34          'Qualifier', 
 35          'GO_ID', 
 36          'DB:Reference', 
 37          'Evidence', 
 38          'With', 
 39          'Aspect', 
 40          'DB_Object_Name', 
 41          'Synonym', 
 42          'DB_Object_Type', 
 43          'Taxon_ID', 
 44          'Date', 
 45          'Assigned_By', 
 46          'Annotation_Extension', 
 47          'Gene_Product_Form_ID'] 
 48   
 49  # GAF version 1.0 
 50  GAF10FIELDS = ['DB', 
 51          'DB_Object_ID', 
 52          'DB_Object_Symbol', 
 53          'Qualifier', 
 54          'GO_ID', 
 55          'DB:Reference', 
 56          'Evidence', 
 57          'With', 
 58          'Aspect', 
 59          'DB_Object_Name', 
 60          'Synonym', 
 61          'DB_Object_Type', 
 62          'Taxon_ID', 
 63          'Date', 
 64          'Assigned_By'] 
 65   
 66   
 67  # GPA version 1.0 
 68  GPA10FIELDS = [ 
 69        'DB', 
 70        'DB_Object_ID', 
 71        'Qualifier', 
 72        'GO_ID', 
 73        'DB:Reference', 
 74        'Evidence code', 
 75        'With', 
 76        'Interacting_taxon_ID', 
 77        'Date', 
 78        'Assigned_by', 
 79        'Annotation_Extension', 
 80        'Spliceform_ID'] 
 81   
 82  # GPA version 1.1 
 83  GPA11FIELDS = [ 
 84        'DB', 
 85        'DB_Object_ID', 
 86        'Qualifier', 
 87        'GO_ID', 
 88        'DB:Reference', 
 89        'ECO_Evidence_code', 
 90        'With', 
 91        'Interacting_taxon_ID', 
 92        'Date', 
 93        'Assigned_by', 
 94        'Annotation Extension', 
 95        'Annotation_Properties'] 
 96   
 97  # GPI version 1.0 
 98  GPI10FIELDS = [ 
 99        'DB', 
100        'DB_subset', 
101        'DB_Object_ID', 
102        'DB_Object_Symbol', 
103        'DB_Object_Name', 
104        'DB_Object_Synonym', 
105        'DB_Object_Type', 
106        'Taxon', 
107        'Annotation_Target_Set', 
108        'Annotation_Completed', 
109        'Parent_Object_ID'] 
110   
111  # GPI version 1.1 
112  GPI11FIELDS = [ 
113        'DB_Object_ID', 
114        'DB_Object_Symbol', 
115        'DB_Object_Name', 
116        'DB_Object_Synonym', 
117        'DB_Object_Type', 
118        'Taxon', 
119        'Parent_Object_ID', 
120        'DB_Xref', 
121        'Gene_Product_Properties', 
122        'Annotation_Target_Set', 
123        'GO_Annotation_Complete'] 
124   
125   
126 -def _gpi10iterator(handle):
127 """Read GPI 1.0 format files (PRIVATE). 128 129 This iterator is used to read a gp_information.goa_uniprot 130 file which is in the GPI 1.0 format. 131 """ 132 for inline in handle: 133 if inline[0] == '!': 134 continue 135 inrec = inline.rstrip('\n').split('\t') 136 if len(inrec) == 1: 137 continue 138 inrec[5] = inrec[5].split('|') # DB_Object_Synonym(s) 139 inrec[8] = inrec[8].split('|') # Annotation_Target_Set 140 yield dict(zip(GPI10FIELDS, inrec))
141 142
143 -def _gpi11iterator(handle):
144 """Read GPI 1.0 format files (PRIVATE). 145 146 This iterator is used to read a gp_information.goa_uniprot 147 file which is in the GPI 1.0 format. 148 """ 149 for inline in handle: 150 if inline[0] == '!': 151 continue 152 inrec = inline.rstrip('\n').split('\t') 153 if len(inrec) == 1: 154 continue 155 inrec[2] = inrec[2].split('|') # DB_Object_Name 156 inrec[3] = inrec[3].split('|') # DB_Object_Synonym(s) 157 inrec[7] = inrec[7].split('|') # DB_Xref(s) 158 inrec[8] = inrec[8].split('|') # Properties 159 yield dict(zip(GPI11FIELDS, inrec))
160 161
162 -def gpi_iterator(handle):
163 """Read GPI format files. 164 165 This function should be called to read a 166 gp_information.goa_uniprot file. At the moment, there is 167 only one format, but this may change, so 168 this function is a placeholder a future wrapper. 169 """ 170 inline = handle.readline() 171 if inline.strip() == '!gpi-version: 1.1': 172 sys.stderr.write("gpi 1.1\n") 173 return _gpi11iterator(handle) 174 elif inline.strip() == '!gpi-version: 1.0': 175 sys.stderr.write("gpi 1.0\n") 176 return _gpi10iterator(handle) 177 elif inline.strip() == '!gpi-version: 2.1': 178 sys.stderr.write("gpi 2.1\n") 179 return _gpi20iterator(handle) 180 else: 181 raise ValueError('Unknown GPI version {0}\n'.format(inline))
182 183
184 -def _gpa10iterator(handle):
185 """Read GPA 1.0 format files (PRIVATE). 186 187 This iterator is used to read a gp_association.* 188 file which is in the GPA 1.0 format. Do not call directly. Rather, 189 use the gpaiterator function. 190 """ 191 192 for inline in handle: 193 if inline[0] == '!': 194 continue 195 inrec = inline.rstrip('\n').split('\t') 196 if len(inrec) == 1: 197 continue 198 inrec[2] = inrec[2].split('|') # Qualifier 199 inrec[4] = inrec[4].split('|') # DB:Reference(s) 200 inrec[6] = inrec[6].split('|') # With 201 inrec[10] = inrec[10].split('|') # Annotation extension 202 yield dict(zip(GPA10FIELDS, inrec))
203 204
205 -def _gpa11iterator(handle):
206 """Read GPA 1.1 format files (PRIVATE). 207 208 This iterator is used to read a gp_association.goa_uniprot 209 file which is in the GPA 1.1 format. Do not call directly. Rather 210 use the gpa_iterator function 211 """ 212 for inline in handle: 213 if inline[0] == '!': 214 continue 215 inrec = inline.rstrip('\n').split('\t') 216 if len(inrec) == 1: 217 continue 218 inrec[2] = inrec[2].split('|') # Qualifier 219 inrec[4] = inrec[4].split('|') # DB:Reference(s) 220 inrec[6] = inrec[6].split('|') # With 221 inrec[10] = inrec[10].split('|') # Annotation extension 222 yield dict(zip(GPA11FIELDS, inrec))
223 224
225 -def gpa_iterator(handle):
226 """Wrapper function: read GPA format files. 227 228 This function should be called to read a 229 gene_association.goa_uniprot file. Reads the first record and 230 returns a gpa 1.1 or a gpa 1.0 iterator as needed 231 """ 232 inline = handle.readline() 233 if inline.strip() == '!gpa-version: 1.1': 234 sys.stderr.write("gpa 1.1\n") 235 return _gpa11iterator(handle) 236 elif inline.strip() == '!gpa-version: 1.0': 237 sys.stderr.write("gpa 1.0\n") 238 return _gpa10iterator(handle) 239 else: 240 raise ValueError('Unknown GPA version {0}\n'.format(inline))
241 242
243 -def _gaf20iterator(handle):
244 for inline in handle: 245 if inline[0] == '!': 246 continue 247 inrec = inline.rstrip('\n').split('\t') 248 if len(inrec) == 1: 249 continue 250 inrec[3] = inrec[3].split('|') # Qualifier 251 inrec[5] = inrec[5].split('|') # DB:reference(s) 252 inrec[7] = inrec[7].split('|') # With || From 253 inrec[10] = inrec[10].split('|') # Synonym 254 inrec[12] = inrec[12].split('|') # Taxon 255 yield dict(zip(GAF20FIELDS, inrec))
256 257
258 -def _gaf10iterator(handle):
259 for inline in handle: 260 if inline[0] == '!': 261 continue 262 inrec = inline.rstrip('\n').split('\t') 263 if len(inrec) == 1: 264 continue 265 inrec[3] = inrec[3].split('|') # Qualifier 266 inrec[5] = inrec[5].split('|') # DB:reference(s) 267 inrec[7] = inrec[7].split('|') # With || From 268 inrec[10] = inrec[10].split('|') # Synonym 269 inrec[12] = inrec[12].split('|') # Taxon 270 yield dict(zip(GAF10FIELDS, inrec))
271 272
273 -def _gaf10byproteiniterator(handle):
274 cur_id = None 275 id_rec_list = [] 276 for inline in handle: 277 if inline[0] == '!': 278 continue 279 inrec = inline.rstrip('\n').split('\t') 280 if len(inrec) == 1: 281 continue 282 inrec[3] = inrec[3].split('|') # Qualifier 283 inrec[5] = inrec[5].split('|') # DB:reference(s) 284 inrec[7] = inrec[7].split('|') # With || From 285 inrec[10] = inrec[10].split('|') # Synonym 286 inrec[12] = inrec[12].split('|') # Taxon 287 cur_rec = dict(zip(GAF10FIELDS, inrec)) 288 if cur_rec['DB_Object_ID'] != cur_id and cur_id: 289 ret_list = copy.copy(id_rec_list) 290 id_rec_list = [cur_rec] 291 cur_id = cur_rec['DB_Object_ID'] 292 yield ret_list 293 else: 294 cur_id = cur_rec['DB_Object_ID'] 295 id_rec_list.append(cur_rec)
296 297
298 -def _gaf20byproteiniterator(handle):
299 cur_id = None 300 id_rec_list = [] 301 for inline in handle: 302 if inline[0] == '!': 303 continue 304 inrec = inline.rstrip('\n').split('\t') 305 if len(inrec) == 1: 306 continue 307 inrec[3] = inrec[3].split('|') # Qualifier 308 inrec[5] = inrec[5].split('|') # DB:reference(s) 309 inrec[7] = inrec[7].split('|') # With || From 310 inrec[10] = inrec[10].split('|') # Synonym 311 inrec[12] = inrec[12].split('|') # Taxon 312 cur_rec = dict(zip(GAF20FIELDS, inrec)) 313 if cur_rec['DB_Object_ID'] != cur_id and cur_id: 314 ret_list = copy.copy(id_rec_list) 315 id_rec_list = [cur_rec] 316 cur_id = cur_rec['DB_Object_ID'] 317 yield ret_list 318 else: 319 cur_id = cur_rec['DB_Object_ID'] 320 id_rec_list.append(cur_rec)
321 322
323 -def gafbyproteiniterator(handle):
324 """Iterates over records in a gene association file. 325 326 Returns a list of all consecutive records with the same DB_Object_ID 327 This function should be called to read a 328 gene_association.goa_uniprot file. Reads the first record and 329 returns a gaf 2.0 or a gaf 1.0 iterator as needed 330 2016-04-09: added GAF 2.1 iterator & fixed bug in iterator assignment 331 In the meantime GAF 2.1 uses the GAF 2.0 iterator 332 """ 333 inline = handle.readline() 334 if inline.strip() == '!gaf-version: 2.0': 335 sys.stderr.write("gaf 2.0\n") 336 return _gaf20byproteiniterator(handle) 337 elif inline.strip() == '!gaf-version: 1.0': 338 sys.stderr.write("gaf 1.0\n") 339 return _gaf10byproteiniterator(handle) 340 elif inline.strip() == '!gaf-version: 2.1': 341 # Handle GAF 2.1 as GAF 2.0 for now TODO: fix 342 sys.stderr.write("gaf 2.1\n") 343 return _gaf20byproteiniterator(handle) 344 else: 345 raise ValueError('Unknown GAF version {0}\n'.format(inline))
346 347
348 -def gafiterator(handle):
349 """Iterate pver a GAF 1.0 or 2.0 file. 350 351 This function should be called to read a 352 gene_association.goa_uniprot file. Reads the first record and 353 returns a gaf 2.0 or a gaf 1.0 iterator as needed 354 """ 355 inline = handle.readline() 356 if inline.strip() == '!gaf-version: 2.0': 357 sys.stderr.write("gaf 2.0\n") 358 return _gaf20iterator(handle) 359 elif inline.strip() == '!gaf-version: 2.1': 360 sys.stderr.write("gaf 2.1\n") 361 # Handle GAF 2.1 as GAF 2.0 for now. TODO: fix 362 return _gaf20iterator(handle) 363 elif inline.strip() == '!gaf-version: 1.0': 364 sys.stderr.write("gaf 1.0\n") 365 return _gaf10iterator(handle) 366 else: 367 raise ValueError('Unknown GAF version {0}\n'.format(inline))
368 369
370 -def writerec(outrec, handle, fields=GAF20FIELDS):
371 """Write a single UniProt-GOA record to an output stream. 372 373 Caller should know the format version. Default: gaf-2.0 374 If header has a value, then it is assumed this is the first record, 375 a header is written. 376 """ 377 outstr = '' 378 for field in fields[:-1]: 379 if isinstance(outrec[field], list): 380 for subfield in outrec[field]: 381 outstr += subfield + '|' 382 outstr = outstr[:-1] + '\t' 383 else: 384 outstr += outrec[field] + '\t' 385 outstr += outrec[fields[-1]] + '\n' 386 handle.write("%s" % outstr)
387 388
389 -def writebyproteinrec(outprotrec, handle, fields=GAF20FIELDS):
390 """Write a list of GAF records to an output stream. 391 392 Caller should know the format version. Default: gaf-2.0 393 If header has a value, then it is assumed this is the first record, 394 a header is written. Typically the list is the one read by fafbyproteinrec, which 395 contains all consecutive lines with the same DB_Object_ID 396 """ 397 for outrec in outprotrec: 398 writerec(outrec, handle, fields=fields)
399 400
401 -def record_has(inrec, fieldvals):
402 """Accepts a record, and a dictionary of field values. 403 404 The format is {'field_name': set([val1, val2])}. 405 If any field in the record has a matching value, the function returns 406 True. Otherwise, returns False. 407 """ 408 retval = False 409 for field in fieldvals: 410 if isinstance(inrec[field], str): 411 set1 = set([inrec[field]]) 412 else: 413 set1 = set(inrec[field]) 414 if (set1 & fieldvals[field]): 415 retval = True 416 break 417 return retval
418 419 420 if __name__ == '__main__': 421 """Example: read and filter a GAF file. 422 423 Write only S. cerevisiae records, but remove all 424 records with IEA evidence 425 """ 426 banned = {'Evidence': set(['IEA', 'EXP'])} 427 allowed = {'Taxon_ID': set(['taxon:4932'])} 428 for inrec in gafiterator(open(sys.argv[1])): 429 if record_has(inrec, allowed) and \ 430 not record_has(inrec, banned): 431 writerec(inrec, sys.stdout, GAF10FIELDS) 432