Package Bio :: Package UniProt :: Module GOA
[hide private]
[frames] | no frames]

Source Code for Module Bio.UniProt.GOA

  1  #!/usr/bin/env python 
  2  # Copyright 2013 by Iddo Friedberg idoerg@gmail.com 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7  """Parsers for the GAF, GPA and GPI formats from UniProt-GOA. 
  8   
  9  Uniprot-GOA README + GAF format description: 
 10  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/README 
 11   
 12  GAF formats: 
 13  http://www.geneontology.org/GO.format.annotation.shtml 
 14  gp_association (GPA format) README: 
 15  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gp_association_readme 
 16   
 17  gp_information (GPI format) README: 
 18  ftp://ftp.ebi.ac.uk/pub/databases/GO/goa/UNIPROT/gp_information_readme 
 19  """ 
 20   
 21  import copy 
 22  import sys 
 23   
 24  from Bio._py3k import zip 
 25   
 26   
 27  # GAF: GO Annotation Format 
 28  # 
 29  # GAF version 2.0 
 30   
 31  GAF20FIELDS = ['DB', 
 32          'DB_Object_ID', 
 33          'DB_Object_Symbol', 
 34          'Qualifier', 
 35          'GO_ID', 
 36          'DB:Reference', 
 37          'Evidence', 
 38          'With', 
 39          'Aspect', 
 40          'DB_Object_Name', 
 41          'Synonym', 
 42          'DB_Object_Type', 
 43          'Taxon_ID', 
 44          'Date', 
 45          'Assigned_By', 
 46          'Annotation_Extension', 
 47          'Gene_Product_Form_ID'] 
 48   
 49  # GAF version 1.0 
 50  GAF10FIELDS = ['DB', 
 51          'DB_Object_ID', 
 52          'DB_Object_Symbol', 
 53          'Qualifier', 
 54          'GO_ID', 
 55          'DB:Reference', 
 56          'Evidence', 
 57          'With', 
 58          'Aspect', 
 59          'DB_Object_Name', 
 60          'Synonym', 
 61          'DB_Object_Type', 
 62          'Taxon_ID', 
 63          'Date', 
 64          'Assigned_By'] 
 65   
 66   
 67  # GPA version 1.0 
 68  GPA10FIELDS = [ 
 69        'DB', 
 70        'DB_Object_ID', 
 71        'Qualifier', 
 72        'GO_ID', 
 73        'DB:Reference', 
 74        'Evidence code', 
 75        'With', 
 76        'Interacting_taxon_ID', 
 77        'Date', 
 78        'Assigned_by', 
 79        'Annotation_Extension', 
 80        'Spliceform_ID'] 
 81   
 82  # GPA version 1.1 
 83  GPA11FIELDS = [ 
 84        'DB', 
 85        'DB_Object_ID', 
 86        'Qualifier', 
 87        'GO_ID', 
 88        'DB:Reference', 
 89        'ECO_Evidence_code', 
 90        'With', 
 91        'Interacting_taxon_ID', 
 92        'Date', 
 93        'Assigned_by', 
 94        'Annotation Extension', 
 95        'Annotation_Properties'] 
 96   
 97  # GPI version 1.0 
 98  GPI10FIELDS = [ 
 99        'DB', 
100        'DB_subset', 
101        'DB_Object_ID', 
102        'DB_Object_Symbol', 
103        'DB_Object_Name', 
104        'DB_Object_Synonym', 
105        'DB_Object_Type', 
106        'Taxon', 
107        'Annotation_Target_Set', 
108        'Annotation_Completed', 
109        'Parent_Object_ID'] 
110   
111  # GPI version 1.1 
# Column names of a GPI 1.1 record, in file order.  The GPI 1.1 parser pairs
# these names positionally with the tab-separated columns of each data line.
GPI11FIELDS = [
    'DB_Object_ID',
    'DB_Object_Symbol',
    'DB_Object_Name',
    'DB_Object_Synonym',
    'DB_Object_Type',
    'Taxon',
    'Parent_Object_ID',
    'DB_Xref',
    'Gene_Product_Properties',
    'Annotation_Target_Set',
    'GO_Annotation_Complete',
]
124   
125   
def _gpi10iterator(handle):
    """Yield one dictionary per data line of a GPI 1.0 file (PRIVATE).

    Lines whose first character is '!' and lines that contain no tab are
    skipped.  The synonym and annotation-target-set columns are split on
    '|' into lists; all other columns stay strings.  Each record is keyed
    by the names in GPI10FIELDS.
    """
    # Columns holding '|'-separated lists:
    # 5 = DB_Object_Synonym(s), 8 = Annotation_Target_Set
    multi_valued = (5, 8)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GPI10FIELDS, columns))
141 142
def _gpi11iterator(handle):
    """Yield one dictionary per data line of a GPI 1.1 file (PRIVATE).

    Lines whose first character is '!' and lines that contain no tab are
    skipped.  The name, synonym, cross-reference and properties columns
    are split on '|' into lists; all other columns stay strings.  Each
    record is keyed by the names in GPI11FIELDS.
    """
    # Columns holding '|'-separated lists:
    # 2 = DB_Object_Name, 3 = DB_Object_Synonym(s),
    # 7 = DB_Xref(s), 8 = Gene_Product_Properties
    multi_valued = (2, 3, 7, 8)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GPI11FIELDS, columns))
160 161
def gpi_iterator(handle):
    """Read GPI format files.

    Reads the first line of handle to detect the format version and
    returns an iterator of record dictionaries: the GPI 1.1 parser is used
    when that line, stripped of surrounding whitespace, is exactly
    '!gpi-version: 1.1'; otherwise the GPI 1.0 parser is used.  The chosen
    version is reported on standard error.

    If the first line is not a '!' header/comment line it is treated as
    data and handed back to the parser, so a file without a header does
    not lose its first record.
    """
    import itertools  # local import: only needed to push the line back

    first_line = handle.readline()
    lines = handle
    if first_line and not first_line.startswith('!'):
        # The sniffed line is a record, not a header: put it back in front.
        lines = itertools.chain([first_line], handle)
    if first_line.strip() == '!gpi-version: 1.1':
        sys.stderr.write("gpi 1.1\n")
        return _gpi11iterator(lines)
    else:
        sys.stderr.write("gpi 1.0\n")
        return _gpi10iterator(lines)
177 178
def _gpa10iterator(handle):
    """Yield one dictionary per data line of a GPA 1.0 file (PRIVATE).

    Do not call directly; use the gpa_iterator function instead.

    Lines whose first character is '!' and lines that contain no tab are
    skipped.  The qualifier, reference, with and annotation-extension
    columns are split on '|' into lists; all other columns stay strings.
    Each record is keyed by the names in GPA10FIELDS.
    """
    # Columns holding '|'-separated lists:
    # 2 = Qualifier, 4 = DB:Reference(s), 6 = With, 10 = Annotation extension
    multi_valued = (2, 4, 6, 10)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GPA10FIELDS, columns))
198 199
def _gpa11iterator(handle):
    """Yield one dictionary per data line of a GPA 1.1 file (PRIVATE).

    Do not call directly; use the gpa_iterator function instead.

    Lines whose first character is '!' and lines that contain no tab are
    skipped.  The qualifier, reference, with and annotation-extension
    columns are split on '|' into lists; all other columns stay strings.
    Each record is keyed by the names in GPA11FIELDS.
    """
    # Columns holding '|'-separated lists:
    # 2 = Qualifier, 4 = DB:Reference(s), 6 = With, 10 = Annotation extension
    multi_valued = (2, 4, 6, 10)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GPA11FIELDS, columns))
218 219
def gpa_iterator(handle):
    """Wrapper function: read GPA format files.

    Reads the first line of handle to detect the format version and
    returns an iterator of record dictionaries: the GPA 1.1 parser is used
    when that line, stripped of surrounding whitespace, is exactly
    '!gpa-version: 1.1'; otherwise the GPA 1.0 parser is used.  The chosen
    version is reported on standard error.

    If the first line is not a '!' header/comment line it is treated as
    data and handed back to the parser, so a file without a header does
    not lose its first record.
    """
    import itertools  # local import: only needed to push the line back

    first_line = handle.readline()
    lines = handle
    if first_line and not first_line.startswith('!'):
        # The sniffed line is a record, not a header: put it back in front.
        lines = itertools.chain([first_line], handle)
    if first_line.strip() == '!gpa-version: 1.1':
        sys.stderr.write("gpa 1.1\n")
        return _gpa11iterator(lines)
    else:
        sys.stderr.write("gpa 1.0\n")
        return _gpa10iterator(lines)
234 235
def _gaf20iterator(handle):
    """Yield one dictionary per data line of a GAF 2.0 file (PRIVATE).

    Lines whose first character is '!' and lines that contain no tab are
    skipped.  The qualifier, reference, with/from, synonym and taxon
    columns are split on '|' into lists; all other columns stay strings.
    Each record is keyed by the names in GAF20FIELDS.
    """
    # Columns holding '|'-separated lists:
    # 3 = Qualifier, 5 = DB:Reference(s), 7 = With/From,
    # 10 = Synonym, 12 = Taxon
    multi_valued = (3, 5, 7, 10, 12)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GAF20FIELDS, columns))
249 250
def _gaf10iterator(handle):
    """Yield one dictionary per data line of a GAF 1.0 file (PRIVATE).

    Lines whose first character is '!' and lines that contain no tab are
    skipped.  The qualifier, reference, with/from, synonym and taxon
    columns are split on '|' into lists; all other columns stay strings.
    Each record is keyed by the names in GAF10FIELDS.
    """
    # Columns holding '|'-separated lists:
    # 3 = Qualifier, 5 = DB:Reference(s), 7 = With/From,
    # 10 = Synonym, 12 = Taxon
    multi_valued = (3, 5, 7, 10, 12)
    for line in handle:
        if line[0] == '!':
            continue
        columns = line.rstrip('\n').split('\t')
        if len(columns) == 1:
            continue
        for idx in multi_valued:
            columns[idx] = columns[idx].split('|')
        yield dict(zip(GAF10FIELDS, columns))
264 265
def _gaf10byproteiniterator(handle):
    """Yield lists of consecutive GAF 1.0 records sharing an ID (PRIVATE).

    Each data line is parsed into a dictionary keyed by GAF10FIELDS, with
    the qualifier, reference, with/from, synonym and taxon columns split
    on '|' into lists.  Lines starting with '!' and lines that contain no
    tab are skipped.  Consecutive records with the same DB_Object_ID are
    collected into one list, which is yielded when the ID changes.

    The list collected for the last ID is yielded once the input is
    exhausted; without that final yield the last group would be lost.
    """
    # 3 = Qualifier, 5 = DB:Reference(s), 7 = With/From,
    # 10 = Synonym, 12 = Taxon
    multi_valued = (3, 5, 7, 10, 12)
    cur_id = None
    id_rec_list = []
    for inline in handle:
        if inline.startswith('!'):
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        for idx in multi_valued:
            inrec[idx] = inrec[idx].split('|')
        cur_rec = dict(zip(GAF10FIELDS, inrec))
        rec_id = cur_rec['DB_Object_ID']
        if cur_id and rec_id != cur_id:
            # A new ID starts: hand out the finished group and begin a
            # fresh list, so the yielded list is never modified afterwards.
            yield id_rec_list
            id_rec_list = []
        cur_id = rec_id
        id_rec_list.append(cur_rec)
    if id_rec_list:
        # Flush the final group.
        yield id_rec_list
289 290
def _gaf20byproteiniterator(handle):
    """Yield lists of consecutive GAF 2.0 records sharing an ID (PRIVATE).

    Each data line is parsed into a dictionary keyed by GAF20FIELDS, with
    the qualifier, reference, with/from, synonym and taxon columns split
    on '|' into lists.  Lines starting with '!' and lines that contain no
    tab are skipped.  Consecutive records with the same DB_Object_ID are
    collected into one list, which is yielded when the ID changes.

    The list collected for the last ID is yielded once the input is
    exhausted; without that final yield the last group would be lost.
    """
    # 3 = Qualifier, 5 = DB:Reference(s), 7 = With/From,
    # 10 = Synonym, 12 = Taxon
    multi_valued = (3, 5, 7, 10, 12)
    cur_id = None
    id_rec_list = []
    for inline in handle:
        if inline.startswith('!'):
            continue
        inrec = inline.rstrip('\n').split('\t')
        if len(inrec) == 1:
            continue
        for idx in multi_valued:
            inrec[idx] = inrec[idx].split('|')
        cur_rec = dict(zip(GAF20FIELDS, inrec))
        rec_id = cur_rec['DB_Object_ID']
        if cur_id and rec_id != cur_id:
            # A new ID starts: hand out the finished group and begin a
            # fresh list, so the yielded list is never modified afterwards.
            yield id_rec_list
            id_rec_list = []
        cur_id = rec_id
        id_rec_list.append(cur_rec)
    if id_rec_list:
        # Flush the final group.
        yield id_rec_list
314 315
def gafbyproteiniterator(handle):
    """Iterate over a GAF file, grouping records by DB_Object_ID.

    Reads the first line of handle to detect the format version and
    returns an iterator that yields lists of consecutive records sharing
    the same DB_Object_ID: the GAF 2.0 parser is used when that line,
    stripped of surrounding whitespace, is exactly '!gaf-version: 2.0';
    otherwise the GAF 1.0 parser is used.  The chosen version is reported
    on standard error.

    If the first line is not a '!' header/comment line it is treated as
    data and handed back to the parser, so a file without a header does
    not lose its first record.
    """
    import itertools  # local import: only needed to push the line back

    first_line = handle.readline()
    lines = handle
    if first_line and not first_line.startswith('!'):
        # The sniffed line is a record, not a header: put it back in front.
        lines = itertools.chain([first_line], handle)
    if first_line.strip() == '!gaf-version: 2.0':
        sys.stderr.write("gaf 2.0\n")
        return _gaf20byproteiniterator(lines)
    else:
        sys.stderr.write("gaf 1.0\n")
        return _gaf10byproteiniterator(lines)
331 332
def gafiterator(handle):
    """Iterate over a GAF 1.0 or 2.0 file.

    Reads the first line of handle to detect the format version and
    returns an iterator of record dictionaries: the GAF 2.0 parser is used
    when that line, stripped of surrounding whitespace, is exactly
    '!gaf-version: 2.0'; otherwise the GAF 1.0 parser is used.  The chosen
    version is reported on standard error.

    If the first line is not a '!' header/comment line it is treated as
    data and handed back to the parser, so a file without a header does
    not lose its first record.
    """
    import itertools  # local import: only needed to push the line back

    first_line = handle.readline()
    lines = handle
    if first_line and not first_line.startswith('!'):
        # The sniffed line is a record, not a header: put it back in front.
        lines = itertools.chain([first_line], handle)
    if first_line.strip() == '!gaf-version: 2.0':
        sys.stderr.write("gaf 2.0\n")
        return _gaf20iterator(lines)
    else:
        sys.stderr.write("gaf 1.0\n")
        return _gaf10iterator(lines)
347 348
def writerec(outrec, handle, fields=GAF20FIELDS):
    """Write a single UniProt-GOA record to an output stream.

    outrec is a record dictionary, handle is an object with a write
    method, and fields is the ordered list of column names to emit; the
    caller should know the format version (default: GAF 2.0 columns).

    The values of the named fields are joined with tabs and terminated
    with a newline.  A field whose value is a list is written as its
    items joined with '|'; this holds for every column, including the
    last one, and an empty list produces an empty column.

    Raises KeyError if outrec lacks one of the requested fields.
    """
    columns = []
    for field in fields:
        value = outrec[field]
        if isinstance(value, list):
            # Multi-valued column: re-assemble the '|'-separated form.
            value = '|'.join(value)
        columns.append(value)
    handle.write('\t'.join(columns) + '\n')
366 367
def writebyproteinrec(outprotrec, handle, fields=GAF20FIELDS):
    """Write a list of GAF records to an output stream.

    Every record in outprotrec is passed, in order, to writerec together
    with handle and fields.  The caller should know the format version
    (default: GAF 2.0 columns).  Typically the list is one of those
    produced by gafbyproteiniterator, i.e. all consecutive records that
    share a DB_Object_ID.
    """
    for record in outprotrec:
        writerec(record, handle, fields=fields)
378 379
def record_has(inrec, fieldvals):
    """Return True if a record matches any of the given field values.

    inrec is a record dictionary whose values are strings or lists of
    strings.  fieldvals maps field names to the values to look for, e.g.
    {'field_name': set([val1, val2])}; each collection may be a set or any
    other iterable of values (list, tuple, ...).

    Returns True as soon as one named field of the record holds at least
    one of the values listed for it, otherwise False.  A string field is
    compared as a whole, never character by character.

    Raises KeyError if a field named in fieldvals is absent from inrec.
    """
    for field in fieldvals:
        value = inrec[field]
        if isinstance(value, str):
            # Wrap a single string so it is not split into characters.
            present = set([value])
        else:
            present = set(value)
        if not present.isdisjoint(fieldvals[field]):
            return True
    return False
if __name__ == '__main__':
    # Example: read and filter a GAF file.
    #
    # Reads the GAF file named by the first command-line argument and writes
    # to standard output only the S. cerevisiae records (taxon:4932) whose
    # evidence code is neither IEA nor EXP.  Records are written using the
    # GAF 1.0 column set.
    banned = {'Evidence': set(['IEA', 'EXP'])}
    allowed = {'Taxon_ID': set(['taxon:4932'])}
    # The with-statement closes the input file even if parsing fails.
    with open(sys.argv[1]) as gaf_handle:
        for inrec in gafiterator(gaf_handle):
            if record_has(inrec, allowed) and \
                    not record_has(inrec, banned):
                writerec(inrec, sys.stdout, GAF10FIELDS)