Package Bio :: Package GenBank :: Module Record
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.Record

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Hold GenBank data in a straightforward format. 
  7   
  8  classes: 
  9   
 10      - Record - All of the information in a GenBank record. 
 11      - Reference - hold reference data for a record. 
 12      - Feature - Hold the information in a Feature Table. 
 13      - Qualifier - Qualifiers on a Feature. 
 14   
 15  17-MAR-2009: added support for WGS and WGS_SCAFLD lines.  Ying Huang & Iddo Friedberg 
 16  """ 
 17  # local stuff 
 18  import Bio.GenBank 
 19   
 20  __docformat__ = "restructuredtext en" 
 21   
def _wrapped_genbank(information, indent, wrap_space=1, split_char=" "):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with newlines and indentation so it
    fits properly into a GenBank record.

    Arguments:

     - information - The string holding the information we want
       wrapped in GenBank method.

     - indent - The indentation on the lines we are writing.

     - wrap_space - Whether or not to wrap only on spaces in the
       information.

     - split_char - A specific character to split the lines on. By default
       spaces are used.

    Returns the wrapped text.  The first line carries no indent (the caller
    has already written the keyword column); every following line is
    prefixed with ``indent`` spaces, and every line ends with a newline.
    Empty input, or input made up of nothing but split characters, yields
    ".\\n".
    """
    info_length = Record.GB_LINE_LENGTH - indent

    if not information:
        # GenBank files use "." for missing data
        return ".\n"

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        # Cut into fixed-width chunks.  The step is clamped to 1 so that an
        # indent at or beyond the line length cannot stall the chunking.
        chunk_size = max(info_length, 1)
        info_parts = [information[pos:pos + chunk_size]
                      for pos in range(0, len(information), chunk_size)]

    # first get the information string split up by line
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        if len(cur_part) + 1 + len(info_part) > info_length:
            # the next piece does not fit: flush the current line
            if cur_part:
                if split_char != " ":
                    # keep the separator (e.g. ",") at the end of the line
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        elif cur_part == "":
            cur_part = info_part
        else:
            cur_part += split_char + info_part

    # add the last bit of information to the output
    if cur_part:
        output_parts.append(cur_part)

    if not output_parts:
        # The input held only split characters (for example " "), so there
        # is nothing to print; treat it like missing data instead of
        # failing on an empty list.
        return ".\n"

    # now format the information string for return
    continuation = "\n" + " " * indent
    return continuation.join(output_parts) + "\n"
83 84
85 -def _indent_genbank(information, indent):
86 """Write out information with the specified indent. 87 88 Unlike _wrapped_genbank, this function makes no attempt to wrap 89 lines -- it assumes that the information already has newlines in the 90 appropriate places, and will add the specified indent to the start of 91 each line. 92 """ 93 # split the info into lines based on line breaks 94 info_parts = information.split("\n") 95 96 # the first line will have no indent 97 output_info = info_parts[0] + "\n" 98 for info_part in info_parts[1:]: 99 output_info += " " * indent + info_part + "\n" 100 101 return output_info
102 103
class Record(object):
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:

     - locus - The name specified after the LOCUS keyword in the GenBank
       record. This may be the accession number, or a clone id or something
       else.
     - size - The size of the record.
     - residue_type - The type of residues making up the sequence in this
       record. Normally something like RNA, DNA or PROTEIN, but may be as
       esoteric as 'ss-RNA circular'.
     - data_file_division - The division this record is stored under in
       GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
     - date - The date of submission of the record, in a form like
       '28-JUL-1998'
     - definition - The text written on the DEFINITION line.
     - accession - list of all accession numbers for the sequence.
     - nid - Nucleotide identifier number.
     - pid - Protein identifier number
     - version - The accession number + version (ie. AB01234.2)
     - projects - A list of genome sequencing project numbers
       (written on the PROJECT line).
     - dblinks - A list of genome sequencing project number(s) and other
       links (written on the DBLINK lines).
     - db_source - Information about the database the record came from
     - gi - The NCBI gi identifier for the record.
     - keywords - A list of keywords related to the record.
     - segment - If the record is one of a series, this is info about which
       segment this record is (something like '1 of 6').
     - source - The source of material where the sequence came from.
     - organism - The genus and species of the organism (ie. 'Homo sapiens')
     - taxonomy - A listing of the taxonomic classification of the organism,
       starting general and getting more specific.
     - references - A list of Reference objects.
     - comment - Text with any kind of comment about the record.
     - features - A listing of Features making up the feature table.
     - base_counts - A string with the counts of bases for the sequence.
     - origin - A string specifying info about the origin of the sequence.
     - sequence - A string with the sequence itself.
     - contig - A string of location information for a CONTIG in a RefSeq
       file
     - primary - A list, initially empty.  It is not written out by
       __str__.
     - wgs - Text written on the WGS line.
     - wgs_scafld - A list of entries, each written on its own WGS_SCAFLD
       line.
    """

    # constants for outputting GenBank information
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # %-style templates that pad a keyword out to its column width
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    INTERNAL_FORMAT = " " * GB_INTERNAL_INDENT + "%-" + \
                      str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
    OTHER_INTERNAL_FORMAT = " " * GB_OTHER_INTERNAL_INDENT + "%-" + \
                            str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + \
                            "s"

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = " " * GB_FEATURE_INTERNAL_INDENT + "%-" + \
                              str(GB_FEATURE_INDENT -
                                  GB_FEATURE_INTERNAL_INDENT) + "s"
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def __init__(self):
        """Create an empty record: strings are '' and lists are []."""
        self.locus = ''
        self.size = ''
        self.residue_type = ''
        self.data_file_division = ''
        self.date = ''
        self.definition = ''
        self.accession = []
        self.nid = ''
        self.pid = ''
        self.version = ''
        self.projects = []
        self.dblinks = []
        self.db_source = ''
        self.gi = ''
        self.keywords = []
        self.segment = ''
        self.source = ''
        self.organism = ''
        self.taxonomy = []
        self.references = []
        self.comment = ''
        self.features = []
        self.base_counts = ''
        self.origin = ''
        self.sequence = ''
        self.contig = ''
        self.primary = []
        self.wgs = ''
        self.wgs_scafld = []

    def __str__(self):
        """Provide a GenBank formatted output option for a Record.

        The objective of this is to provide an easy way to read in a GenBank
        record, modify it somehow, and then output it in 'GenBank format.'
        We are striving to make this work so that a parsed Record that is
        output using this function will look exactly like the original
        record.

        Much of the output is based on format description info at:

        ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt
        """
        output = self._locus_line()
        output += self._definition_line()
        output += self._accession_line()
        output += self._version_line()
        output += self._project_line()
        output += self._dblink_line()
        output += self._nid_line()
        output += self._pid_line()
        output += self._keywords_line()
        output += self._db_source_line()
        output += self._segment_line()
        output += self._source_line()
        output += self._organism_line()
        for reference in self.references:
            output += str(reference)
        output += self._comment_line()
        output += self._features_line()
        for feature in self.features:
            output += str(feature)
        output += self._base_count_line()
        output += self._origin_line()
        output += self._sequence_line()
        output += self._wgs_line()
        output += self._wgs_scafld_line()
        output += self._contig_line()
        output += "//"
        return output

    def _locus_line(self):
        """Provide the output string for the LOCUS line."""
        residue_type = self.residue_type
        pieces = [
            "LOCUS",
            " " * 7,  # 6-12 spaces
            "%-9s" % self.locus,
            " ",  # 22 space
            "%7s" % self.size,
            " aa" if "PROTEIN" in residue_type else " bp ",
        ]

        # treat circular types differently, since they'll have long residue
        # types
        if "circular" in residue_type:
            pieces.append("%17s" % residue_type)
        # second case: ss-DNA types of records
        elif "-" in residue_type:
            pieces.append("%7s" % residue_type)
            pieces.append(" " * 10)  # spaces for circular
        else:
            pieces.append(" " * 3)  # spaces for stuff like ss-
            pieces.append("%-4s" % residue_type)
            pieces.append(" " * 10)  # spaces for circular

        pieces.append(" " * 2)
        pieces.append("%3s" % self.data_file_division)
        pieces.append(" " * 7)  # spaces for 56-63
        pieces.append("%11s" % self.date)
        pieces.append("\n")
        return "".join(pieces)

    def _definition_line(self):
        """Provide output for the DEFINITION line (always written)."""
        output = Record.BASE_FORMAT % "DEFINITION"
        output += _wrapped_genbank(self.definition, Record.GB_BASE_INDENT)
        return output

    def _accession_line(self):
        """Output for the ACCESSION line; '' when there are no accessions."""
        if not self.accession:
            return ""
        # space-separated accessions, without trailing whitespace
        acc_info = " ".join("%s" % acc for acc in self.accession).rstrip()
        output = Record.BASE_FORMAT % "ACCESSION"
        output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT)
        return output

    def _version_line(self):
        """Output for the VERSION line; '' when no version is set.

        The GI part is only written when a gi value is present, so a
        record without one does not end in a dangling "GI:".
        """
        if not self.version:
            return ""
        output = Record.BASE_FORMAT % "VERSION"
        output += self.version
        if self.gi:
            output += " GI:"
            output += "%s" % self.gi
        return output + "\n"

    def _project_line(self):
        """Output for the PROJECT line; '' when there are no projects."""
        if not self.projects:
            return ""
        output = Record.BASE_FORMAT % "PROJECT"
        output += "%s\n" % " ".join(self.projects)
        return output

    def _dblink_line(self):
        """Output for the DBLINK lines; '' when there are no dblinks.

        __str__ calls this method, so it has to exist for str(record) to
        work.  Each entry of self.dblinks goes on its own line: the first
        after the DBLINK keyword, the others indented to the same column.

        NOTE(review): one cross-reference per line is assumed from the
        DBLINK description in the GenBank release notes -- confirm against
        what the parser stores in dblinks.
        """
        if not self.dblinks:
            return ""
        continuation = "\n" + " " * Record.GB_BASE_INDENT
        output = Record.BASE_FORMAT % "DBLINK"
        output += continuation.join("%s" % link for link in self.dblinks)
        return output + "\n"

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files."""
        if not self.nid:
            return ""
        return Record.BASE_FORMAT % "NID" + "%s\n" % self.nid

    def _pid_line(self):
        """Output for PID line. Presumedly, PID usage is also obsolete."""
        if not self.pid:
            return ""
        return Record.BASE_FORMAT % "PID" + "%s\n" % self.pid

    def _keywords_line(self):
        """Output for the KEYWORDS line.

        The line is always written; with no keywords it holds just ".".
        """
        # keywords are separated by "; " and the list is closed by a period
        keyword_info = "; ".join("%s" % kw for kw in self.keywords) + "."
        output = Record.BASE_FORMAT % "KEYWORDS"
        output += _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT)
        return output

    def _db_source_line(self):
        """Output for DBSOURCE line; '' when db_source is empty."""
        if not self.db_source:
            return ""
        return Record.BASE_FORMAT % "DBSOURCE" + "%s\n" % self.db_source

    def _segment_line(self):
        """Output for the SEGMENT line; '' when segment is empty."""
        if not self.segment:
            return ""
        output = Record.BASE_FORMAT % "SEGMENT"
        output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT)
        return output

    def _source_line(self):
        """Output for SOURCE line on where the sample came from."""
        output = Record.BASE_FORMAT % "SOURCE"
        output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT)
        return output

    def _organism_line(self):
        """Output for ORGANISM line with taxonomy info."""
        output = Record.INTERNAL_FORMAT % "ORGANISM"
        # Now that species names can be too long, this line can wrap (Bug 2591)
        output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT)
        # the taxonomy starts on its own line, below the organism name
        output += " " * Record.GB_BASE_INDENT
        taxonomy_info = "; ".join("%s" % tax for tax in self.taxonomy) + "."
        output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT)
        return output

    def _comment_line(self):
        """Output for the COMMENT lines; '' when there is no comment."""
        if not self.comment:
            return ""
        output = Record.BASE_FORMAT % "COMMENT"
        # the comment keeps its own line breaks, so only indent it
        output += _indent_genbank(self.comment, Record.GB_BASE_INDENT)
        return output

    def _features_line(self):
        """Output for the FEATURES header line; '' when there are no features."""
        if not self.features:
            return ""
        return Record.BASE_FEATURE_FORMAT % "FEATURES" + "Location/Qualifiers\n"

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information."""
        if not self.base_counts:
            return ""
        output = Record.BASE_FORMAT % "BASE COUNT "
        # split up the base counts into their individual parts
        count_parts = [part for part in self.base_counts.split(" ") if part]
        # deal with the standard case, with a normal line
        # like: 474 a 356 c 428 g 364 t
        if len(count_parts) % 2 == 0:
            pairs = zip(count_parts[0::2], count_parts[1::2])
            output += "".join("%7s %s" % (count_info, count_type)
                              for count_info, count_type in pairs)
        # deal with ugly lines like:
        # 1311257 a2224835 c2190093 g1309889 t
        # by just outputting the raw information
        else:
            output += self.base_counts
        return output + "\n"

    def _origin_line(self):
        """Output for the ORIGIN line."""
        # only output the ORIGIN line if we have a sequence
        if not self.sequence:
            return ""
        output = Record.BASE_FORMAT % "ORIGIN"
        if self.origin:
            output += _wrapped_genbank(self.origin, Record.GB_BASE_INDENT)
        else:
            output += "\n"
        return output

    def _sequence_line(self):
        """Output for all of the sequence.

        Each output line starts with the 1-based position of its first
        residue, followed by up to six lower-case groups of ten residues.
        Only groups that contain residues are written, so a sequence whose
        length is a multiple of ten does not get a trailing blank group.
        """
        sequence = self.sequence
        if not sequence:
            return ""
        line_width = 60
        group_width = 10
        lines = []
        for line_start in range(0, len(sequence), line_width):
            row = sequence[line_start:line_start + line_width]
            groups = "".join(" %s" % row[pos:pos + group_width].lower()
                             for pos in range(0, len(row), group_width))
            lines.append(Record.SEQUENCE_FORMAT % str(line_start + 1) +
                         groups + "\n")
        return "".join(lines)

    @staticmethod
    def _wgs_text(value):
        """Return one WGS / WGS_SCAFLD entry as text without a line break.

        A list or tuple (for example a [first, last] pair) is joined with
        "-"; anything else is formatted as-is.  A trailing newline is
        dropped so the caller can add exactly one.
        """
        if isinstance(value, (list, tuple)):
            text = "-".join("%s" % item for item in value)
        else:
            text = "%s" % value
        return text.rstrip("\n")

    def _wgs_line(self):
        """Output for the WGS line; '' when wgs is empty."""
        if not self.wgs:
            return ""
        return Record.BASE_FORMAT % "WGS" + self._wgs_text(self.wgs) + "\n"

    def _wgs_scafld_line(self):
        """Output for the WGS_SCAFLD lines; '' when wgs_scafld is empty.

        wgs_scafld is a list, so one WGS_SCAFLD line is written per entry.
        A bare string is accepted too and written as a single line.
        """
        entries = self.wgs_scafld
        if not entries:
            return ""
        if not isinstance(entries, (list, tuple)):
            entries = [entries]
        keyword = Record.BASE_FORMAT % "WGS_SCAFLD"
        return "".join(keyword + self._wgs_text(entry) + "\n"
                       for entry in entries)

    def _contig_line(self):
        """Output for CONTIG location information from RefSeq."""
        if not self.contig:
            return ""
        output = Record.BASE_FORMAT % "CONTIG"
        # locations are comma separated, so wrap on "," instead of spaces
        output += _wrapped_genbank(self.contig, Record.GB_BASE_INDENT,
                                   split_char=',')
        return output
510 511
class Reference(object):
    """Hold information from a GenBank reference.

    Attributes:

     - number - The number of the reference in the listing of references.
     - bases - The bases in the sequence the reference refers to.
     - authors - String with all of the authors.
     - consrtm - Consortium the authors belong to.
     - title - The title of the reference.
     - journal - Information about the journal where the reference appeared.
     - medline_id - The medline id for the reference.
     - pubmed_id - The pubmed_id for the reference.
     - remark - Free-form remarks about the reference.
    """

    def __init__(self):
        """Start with every field set to the empty string."""
        self.number = ''
        self.bases = ''
        self.authors = ''
        self.consrtm = ''
        self.title = ''
        self.journal = ''
        self.medline_id = ''
        self.pubmed_id = ''
        self.remark = ''

    def __str__(self):
        """Return the reference as GenBank text, one section per field."""
        sections = [
            self._reference_line(),
            self._authors_line(),
            self._consrtm_line(),
            self._title_line(),
            self._journal_line(),
            self._medline_line(),
            self._pubmed_line(),
            self._remark_line(),
        ]
        return "".join(sections)

    def _reference_line(self):
        """Output for REFERENCE lines (always written)."""
        pieces = [Record.BASE_FORMAT % "REFERENCE"]
        if self.number:
            if self.bases:
                # pad the number so the base range starts in a fixed column
                pieces.append("%-3s" % self.number)
                pieces.append("%s" % self.bases)
            else:
                pieces.append("%s" % self.number)
        pieces.append("\n")
        return "".join(pieces)

    def _authors_line(self):
        """Output for AUTHORS information; '' when authors is empty."""
        if not self.authors:
            return ""
        return (Record.INTERNAL_FORMAT % "AUTHORS" +
                _wrapped_genbank(self.authors, Record.GB_BASE_INDENT))

    def _consrtm_line(self):
        """Output for CONSRTM information; '' when consrtm is empty."""
        if not self.consrtm:
            return ""
        return (Record.INTERNAL_FORMAT % "CONSRTM" +
                _wrapped_genbank(self.consrtm, Record.GB_BASE_INDENT))

    def _title_line(self):
        """Output for TITLE information; '' when title is empty."""
        if not self.title:
            return ""
        return (Record.INTERNAL_FORMAT % "TITLE" +
                _wrapped_genbank(self.title, Record.GB_BASE_INDENT))

    def _journal_line(self):
        """Output for JOURNAL information; '' when journal is empty."""
        if not self.journal:
            return ""
        return (Record.INTERNAL_FORMAT % "JOURNAL" +
                _wrapped_genbank(self.journal, Record.GB_BASE_INDENT))

    def _medline_line(self):
        """Output for MEDLINE information; '' when medline_id is empty."""
        if not self.medline_id:
            return ""
        return Record.INTERNAL_FORMAT % "MEDLINE" + self.medline_id + "\n"

    def _pubmed_line(self):
        """Output for PUBMED information; '' when pubmed_id is empty."""
        if not self.pubmed_id:
            return ""
        return Record.OTHER_INTERNAL_FORMAT % "PUBMED" + self.pubmed_id + "\n"

    def _remark_line(self):
        """Output for REMARK information; '' when remark is empty."""
        if not self.remark:
            return ""
        return (Record.INTERNAL_FORMAT % "REMARK" +
                _wrapped_genbank(self.remark, Record.GB_BASE_INDENT))
626 627
class Feature(object):
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:

     - key - The key name of the feature (ie. source)
     - location - The string specifying the location of the feature.
     - qualifiers - A listing of Qualifier objects in the feature.
    """

    def __init__(self):
        """Start with an empty key and location and no qualifiers."""
        self.key = ''
        self.location = ''
        self.qualifiers = []

    def __str__(self):
        """Return the feature and its qualifiers as feature-table text."""
        pieces = [
            Record.INTERNAL_FEATURE_FORMAT % self.key,
            # locations are comma separated, so wrap on "," not on spaces
            _wrapped_genbank(self.location, Record.GB_FEATURE_INDENT,
                             split_char=','),
        ]
        for qualifier in self.qualifiers:
            pieces.append(" " * Record.GB_FEATURE_INDENT)

            # determine whether we can wrap on spaces: any qualifier whose
            # key contains one of the remove_space_keys entries is wrapped
            # at a fixed width instead
            matches_no_space_key = any(
                no_space_key in qualifier.key
                for no_space_key in
                Bio.GenBank._BaseGenBankConsumer.remove_space_keys)
            space_wrap = 0 if matches_no_space_key else 1

            pieces.append(_wrapped_genbank(qualifier.key + qualifier.value,
                                           Record.GB_FEATURE_INDENT,
                                           space_wrap))
        return "".join(pieces)
659 660
class Qualifier(object):
    """Hold information about a qualifier in a GenBank feature.

    Attributes:

     - key - The key name of the qualifier (ie. /organism=)
     - value - The value of the qualifier ("Dictyostelium discoideum").
    """

    def __init__(self):
        """Start with an empty key and an empty value."""
        self.key = self.value = ''
672