Package Bio :: Package GenBank :: Module Record
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.Record

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Hold GenBank data in a straightforward format. 
  7   
  8  classes: 
  9   
 10      - Record - All of the information in a GenBank record. 
 11      - Reference - hold reference data for a record. 
 12      - Feature - Hold the information in a Feature Table. 
 13      - Qualifier - Qualifiers on a Feature. 
 14   
 15  17-MAR-2009: added support for WGS and WGS_SCAFLD lines.  Ying Huang & Iddo Friedberg 
 16  """ 
 17  # local stuff 
 18  import Bio.GenBank 
 19   
 20   
def _wrapped_genbank(information, indent, wrap_space=1, split_char=" "):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with newlines and indentation so it
    fits properly into a GenBank record.

    Arguments:

     - information - The string holding the information we want
       wrapped in GenBank method.
     - indent - The indentation on the lines we are writing. The usable
       width of each line is ``Record.GB_LINE_LENGTH - indent``.
     - wrap_space - Whether or not to wrap only on ``split_char`` in the
       information. When false the text is cut into fixed-width slices.
     - split_char - A specific character to split the lines on. By default
       spaces are used.

    Returns the wrapped text, always terminated by a newline. The first
    line carries no indent (the caller has already written the keyword
    column); every continuation line is prefixed with ``indent`` spaces.
    Missing information -- an empty string, or a string made up solely of
    ``split_char`` -- is written as ".".

    Raises ValueError if fixed-width slicing is requested but ``indent``
    leaves no room on the line.
    """
    info_length = Record.GB_LINE_LENGTH - indent

    if not information:
        # GenBank files use "." for missing data
        return ".\n"

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        if info_length < 1:
            # a zero/negative slice width could never consume the text
            raise ValueError("indent %r leaves no room to write information"
                             % (indent,))
        info_parts = [information[pos:pos + info_length]
                      for pos in range(0, len(information), info_length)]

    # first get the information string split up by line
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        if len(cur_part) + 1 + len(info_part) > info_length:
            # info_part does not fit on the current line: flush the line
            if cur_part:
                if split_char != " ":
                    # a non-space separator is real content, so keep it at
                    # the end of the line we are closing
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        elif cur_part == "":
            cur_part = info_part
        else:
            cur_part += split_char + info_part

    # add the last bit of information to the output
    if cur_part:
        output_parts.append(cur_part)

    if not output_parts:
        # the information held nothing but separators; treat it as missing
        return ".\n"

    # now format the information string for return
    return ("\n" + " " * indent).join(output_parts) + "\n"
82 83
84 -def _indent_genbank(information, indent):
85 """Write out information with the specified indent. 86 87 Unlike _wrapped_genbank, this function makes no attempt to wrap 88 lines -- it assumes that the information already has newlines in the 89 appropriate places, and will add the specified indent to the start of 90 each line. 91 """ 92 # split the info into lines based on line breaks 93 info_parts = information.split("\n") 94 95 # the first line will have no indent 96 output_info = info_parts[0] + "\n" 97 for info_part in info_parts[1:]: 98 output_info += " " * indent + info_part + "\n" 99 100 return output_info
101 102
class Record(object):
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:

     - locus - The name specified after the LOCUS keyword in the GenBank
       record. This may be the accession number, or a clone id or something
       else.
     - size - The size of the record.
     - residue_type - The type of residues making up the sequence in this
       record. Normally something like RNA, DNA or PROTEIN, but may be as
       esoteric as 'ss-RNA circular'.
     - data_file_division - The division this record is stored under in
       GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
     - date - The date of submission of the record, in a form like
       '28-JUL-1998'
     - definition - The text written on the DEFINITION line.
     - accession - list of all accession numbers for the sequence.
     - nid - Nucleotide identifier number.
     - pid - Protein identifier number
     - version - The accession number + version (ie. AB01234.2)
     - projects - The genome sequencing project numbers
       (will be replaced by the dblink cross-references in 2009).
     - dblinks - The genome sequencing project number(s) and other links.
       (will replace the project information in 2009).
     - db_source - Information about the database the record came from
     - gi - The NCBI gi identifier for the record.
     - keywords - A list of keywords related to the record.
     - segment - If the record is one of a series, this is info about which
       segment this record is (something like '1 of 6').
     - source - The source of material where the sequence came from.
     - organism - The genus and species of the organism (ie. 'Homo sapiens')
     - taxonomy - A listing of the taxonomic classification of the organism,
       starting general and getting more specific.
     - references - A list of Reference objects.
     - comment - Text with any kind of comment about the record.
     - features - A listing of Features making up the feature table.
     - base_counts - A string with the counts of bases for the sequence.
     - origin - A string specifying info about the origin of the sequence.
     - sequence - A string with the sequence itself.
     - contig - A string of location information for a CONTIG in a RefSeq
       file
     - primary - A list, empty by default; it is not written out by
       __str__.
     - wgs - The text of the WGS line, either as a string or as a
       list/tuple of accessions that is joined with '-'.
     - wgs_scafld - A list with one entry per WGS_SCAFLD line; each entry
       is a string or a list/tuple of accessions that is joined with '-'.
    """

    # constants for outputting GenBank information
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # %-style templates that pad a keyword out to its column width
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    INTERNAL_FORMAT = " " * GB_INTERNAL_INDENT + "%-" + \
        str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
    OTHER_INTERNAL_FORMAT = " " * GB_OTHER_INTERNAL_INDENT + "%-" + \
        str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + \
        "s"

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = " " * GB_FEATURE_INTERNAL_INDENT + "%-" + \
        str(GB_FEATURE_INDENT -
            GB_FEATURE_INTERNAL_INDENT) + "s"
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def __init__(self):
        """Create an empty record; every field starts out blank."""
        self.locus = ''
        self.size = ''
        self.residue_type = ''
        self.data_file_division = ''
        self.date = ''
        self.definition = ''
        self.accession = []
        self.nid = ''
        self.pid = ''
        self.version = ''
        self.projects = []
        self.dblinks = []
        self.db_source = ''
        self.gi = ''
        self.keywords = []
        self.segment = ''
        self.source = ''
        self.organism = ''
        self.taxonomy = []
        self.references = []
        self.comment = ''
        self.features = []
        self.base_counts = ''
        self.origin = ''
        self.sequence = ''
        self.contig = ''
        self.primary = []
        self.wgs = ''
        self.wgs_scafld = []

    def __str__(self):
        """Provide a GenBank formatted output option for a Record.

        The objective of this is to provide an easy way to read in a GenBank
        record, modify it somehow, and then output it in 'GenBank format.'
        We are striving to make this work so that a parsed Record that is
        output using this function will look exactly like the original
        record.

        Much of the output is based on format description info at:

        ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt
        """
        # the order below is the order the sections appear in the output
        parts = [
            self._locus_line(),
            self._definition_line(),
            self._accession_line(),
            self._version_line(),
            self._project_line(),
            self._dblink_line(),
            self._nid_line(),
            self._pid_line(),
            self._keywords_line(),
            self._db_source_line(),
            self._segment_line(),
            self._source_line(),
            self._organism_line(),
        ]
        parts.extend(str(reference) for reference in self.references)
        parts.append(self._comment_line())
        parts.append(self._features_line())
        parts.extend(str(feature) for feature in self.features)
        parts.extend([
            self._base_count_line(),
            self._origin_line(),
            self._sequence_line(),
            self._wgs_line(),
            self._wgs_scafld_line(),
            self._contig_line(),
            "//",
        ])
        return "".join(parts)

    @staticmethod
    def _accession_range(entry):
        """Return a WGS-style entry as text.

        A list or tuple of accessions is joined with '-'; anything else is
        simply converted to its string form.
        """
        if isinstance(entry, (list, tuple)):
            return "-".join("%s" % item for item in entry)
        return "%s" % entry

    def _locus_line(self):
        """Provide the output string for the LOCUS line."""
        output = "LOCUS"
        output += " " * 7  # 6-12 spaces
        output += "%-9s" % self.locus
        output += " "  # 22 space
        output += "%7s" % self.size
        if "PROTEIN" in self.residue_type:
            output += " aa"
        else:
            output += " bp "

        # treat circular types differently, since they'll have long residue
        # types
        if "circular" in self.residue_type:
            output += "%17s" % self.residue_type
        # second case: ss-DNA types of records
        elif "-" in self.residue_type:
            output += "%7s" % self.residue_type
            output += " " * 10  # spaces for circular
        else:
            output += " " * 3  # spaces for stuff like ss-
            output += "%-4s" % self.residue_type
            output += " " * 10  # spaces for circular

        output += " " * 2
        output += "%3s" % self.data_file_division
        output += " " * 7  # spaces for 56-63
        output += "%11s" % self.date
        output += "\n"
        return output

    def _definition_line(self):
        """Provide output for the DEFINITION line (always written)."""
        return Record.BASE_FORMAT % "DEFINITION" + \
            _wrapped_genbank(self.definition, Record.GB_BASE_INDENT)

    def _accession_line(self):
        """Output for the ACCESSION line; empty when there are none."""
        if not self.accession:
            return ""
        # rstrip mirrors the historical behaviour of dropping any blank
        # left over at the end of the accession list
        acc_info = " ".join("%s" % acc for acc in self.accession).rstrip()
        return Record.BASE_FORMAT % "ACCESSION" + \
            _wrapped_genbank(acc_info, Record.GB_BASE_INDENT)

    def _version_line(self):
        """Output for the VERSION line.

        The GI number is appended only when one is set, so records
        without a gi do not end in a dangling "GI:".
        """
        if not self.version:
            return ""
        output = Record.BASE_FORMAT % "VERSION"
        output += self.version
        if self.gi:
            # NOTE(review): the reviewed listing had collapsed whitespace;
            # confirm the number of blanks before "GI:" against gbrel.txt.
            output += " GI:"
            output += "%s" % self.gi
        return output + "\n"

    def _project_line(self):
        """Output for the PROJECT line; empty when there are no projects."""
        if not self.projects:
            return ""
        return Record.BASE_FORMAT % "PROJECT" + \
            "%s\n" % " ".join(self.projects)

    def _dblink_line(self):
        """Output for the DBLINK line(s); empty when there are no links.

        Each entry of ``dblinks`` goes on its own line, with every line
        after the first indented to the base column.

        NOTE(review): __str__ calls this method but its body was elided
        from the reviewed listing; confirm this layout against the
        upstream implementation.
        """
        if not self.dblinks:
            return ""
        return Record.BASE_FORMAT % "DBLINK" + \
            _indent_genbank("\n".join(self.dblinks), Record.GB_BASE_INDENT)

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files.
        """
        if not self.nid:
            return ""
        return Record.BASE_FORMAT % "NID" + "%s\n" % self.nid

    def _pid_line(self):
        """Output for PID line. Presumedly, PID usage is also obsolete.
        """
        if not self.pid:
            return ""
        return Record.BASE_FORMAT % "PID" + "%s\n" % self.pid

    def _keywords_line(self):
        """Output for the KEYWORDS line.

        The line is always written; keywords are separated by "; " and the
        list is closed with a period, so an empty list comes out as ".".
        """
        keyword_info = "; ".join("%s" % word for word in self.keywords) + "."
        return Record.BASE_FORMAT % "KEYWORDS" + \
            _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT)

    def _db_source_line(self):
        """Output for DBSOURCE line; empty when no source is set."""
        if not self.db_source:
            return ""
        return Record.BASE_FORMAT % "DBSOURCE" + "%s\n" % self.db_source

    def _segment_line(self):
        """Output for the SEGMENT line; empty when no segment is set."""
        if not self.segment:
            return ""
        return Record.BASE_FORMAT % "SEGMENT" + \
            _wrapped_genbank(self.segment, Record.GB_BASE_INDENT)

    def _source_line(self):
        """Output for SOURCE line on where the sample came from."""
        return Record.BASE_FORMAT % "SOURCE" + \
            _wrapped_genbank(self.source, Record.GB_BASE_INDENT)

    def _organism_line(self):
        """Output for ORGANISM line with taxonomy info.

        The taxonomy follows on its own indented line(s), "; "-separated
        and closed with a period.
        """
        output = Record.INTERNAL_FORMAT % "ORGANISM"
        # Now that species names can be too long, this line can wrap (Bug 2591)
        output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT)
        # the taxonomy starts a fresh line, so it needs its own indent
        output += " " * Record.GB_BASE_INDENT
        taxonomy_info = "; ".join("%s" % tax for tax in self.taxonomy) + "."
        output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT)
        return output

    def _comment_line(self):
        """Output for the COMMENT lines; empty when there is no comment.

        The comment keeps its own line breaks; only the indent is added.
        """
        if not self.comment:
            return ""
        return Record.BASE_FORMAT % "COMMENT" + \
            _indent_genbank(self.comment, Record.GB_BASE_INDENT)

    def _features_line(self):
        """Output for the FEATURES header line; empty without features."""
        if not self.features:
            return ""
        return Record.BASE_FEATURE_FORMAT % "FEATURES" + \
            "Location/Qualifiers\n"

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information."""
        if not self.base_counts:
            return ""
        output = Record.BASE_FORMAT % "BASE COUNT "
        # split up the base counts into their individual parts
        count_parts = [part for part in self.base_counts.split(" ") if part]
        if len(count_parts) % 2 == 0:
            # deal with the standard case, with a normal base count line
            # like: 474 a 356 c 428 g 364 t
            pairs = zip(count_parts[0::2], count_parts[1::2])
            output += "".join("%7s %s" % pair for pair in pairs)
        else:
            # deal with ugly lines like:
            # 1311257 a2224835 c2190093 g1309889 t
            # by just outputting the raw information
            output += self.base_counts
        return output + "\n"

    def _origin_line(self):
        """Output for the ORIGIN line.

        The line is only written when the record holds a sequence.
        """
        if not self.sequence:
            return ""
        output = Record.BASE_FORMAT % "ORIGIN"
        if self.origin:
            output += _wrapped_genbank(self.origin, Record.GB_BASE_INDENT)
        else:
            output += "\n"
        return output

    def _sequence_line(self):
        """Output for all of the sequence.

        The sequence is written in lower case, 60 residues per line in
        blocks of 10, each line starting with the right-aligned 1-based
        position of its first residue. No trailing blank is left after
        the final block.
        """
        sequence = self.sequence
        if not sequence:
            return ""
        lines = []
        for line_start in range(0, len(sequence), 60):
            row = sequence[line_start:line_start + 60].lower()
            blocks = [row[pos:pos + 10] for pos in range(0, len(row), 10)]
            lines.append(Record.SEQUENCE_FORMAT % str(line_start + 1) +
                         " " + " ".join(blocks) + "\n")
        return "".join(lines)

    def _wgs_line(self):
        """Output for the WGS line; empty when no WGS data is set."""
        if not self.wgs:
            return ""
        return Record.BASE_FORMAT % "WGS" + \
            self._accession_range(self.wgs) + "\n"

    def _wgs_scafld_line(self):
        """Output for the WGS_SCAFLD line(s); empty when there are none.

        Every entry of ``wgs_scafld`` is written on its own WGS_SCAFLD
        line. A bare (non-list) value is treated as a single entry.
        """
        entries = self.wgs_scafld
        if not entries:
            return ""
        if not isinstance(entries, (list, tuple)):
            entries = [entries]
        keyword = Record.BASE_FORMAT % "WGS_SCAFLD"
        return "".join(keyword + self._accession_range(entry) + "\n"
                       for entry in entries)

    def _contig_line(self):
        """Output for CONTIG location information from RefSeq."""
        if not self.contig:
            return ""
        # locations are comma separated, so wrap on commas, not spaces
        return Record.BASE_FORMAT % "CONTIG" + \
            _wrapped_genbank(self.contig, Record.GB_BASE_INDENT,
                             split_char=',')
509 510
class Reference(object):
    """Hold information from a GenBank reference.

    Attributes:

     - number - The number of the reference in the listing of references.
     - bases - The bases in the sequence the reference refers to.
     - authors - String with all of the authors.
     - consrtm - Consortium the authors belong to.
     - title - The title of the reference.
     - journal - Information about the journal where the reference appeared.
     - medline_id - The medline id for the reference.
     - pubmed_id - The pubmed_id for the reference.
     - remark - Free-form remarks about the reference.
    """

    def __init__(self):
        """Create an empty reference; every field is a blank string."""
        self.number = ''
        self.bases = ''
        self.authors = ''
        self.consrtm = ''
        self.title = ''
        self.journal = ''
        self.medline_id = ''
        self.pubmed_id = ''
        self.remark = ''

    def __str__(self):
        """Return the reference as GenBank text.

        The REFERENCE line is always present; each remaining line appears
        only when its field is set.
        """
        sections = (
            self._reference_line(),
            self._authors_line(),
            self._consrtm_line(),
            self._title_line(),
            self._journal_line(),
            self._medline_line(),
            self._pubmed_line(),
            self._remark_line(),
        )
        return "".join(sections)

    def _wrapped_entry(self, keyword, text):
        """Return a wrapped sub-keyword line, or "" when text is empty."""
        if not text:
            return ""
        entry = Record.INTERNAL_FORMAT % keyword
        entry += _wrapped_genbank(text, Record.GB_BASE_INDENT)
        return entry

    def _reference_line(self):
        """Output for REFERENCE lines.

        The number is padded to three columns only when base information
        follows it.
        """
        line = Record.BASE_FORMAT % "REFERENCE"
        if self.number:
            if self.bases:
                line += "%-3s" % self.number
                line += "%s" % self.bases
            else:
                line += "%s" % self.number
        return line + "\n"

    def _authors_line(self):
        """Output for AUTHORS information."""
        return self._wrapped_entry("AUTHORS", self.authors)

    def _consrtm_line(self):
        """Output for CONSRTM information."""
        return self._wrapped_entry("CONSRTM", self.consrtm)

    def _title_line(self):
        """Output for TITLE information."""
        return self._wrapped_entry("TITLE", self.title)

    def _journal_line(self):
        """Output for JOURNAL information."""
        return self._wrapped_entry("JOURNAL", self.journal)

    def _medline_line(self):
        """Output for MEDLINE information (written unwrapped)."""
        if not self.medline_id:
            return ""
        return Record.INTERNAL_FORMAT % "MEDLINE" + (self.medline_id + "\n")

    def _pubmed_line(self):
        """Output for PUBMED information (written unwrapped).

        PUBMED uses the deeper OTHER_INTERNAL_FORMAT indent, unlike the
        other sub-keywords of a reference.
        """
        if not self.pubmed_id:
            return ""
        return Record.OTHER_INTERNAL_FORMAT % "PUBMED" + \
            (self.pubmed_id + "\n")

    def _remark_line(self):
        """Output for REMARK information."""
        return self._wrapped_entry("REMARK", self.remark)
625 626
class Feature(object):
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:

     - key - The key name of the feature (ie. source)
     - location - The string specifying the location of the feature.
     - qualifiers - A listing of Qualifier objects in the feature.
    """

    def __init__(self):
        """Create an empty feature with no qualifiers."""
        self.key = ''
        self.location = ''
        self.qualifiers = []

    def __str__(self):
        """Return the feature and its qualifiers as feature-table text.

        The location is wrapped on commas. Each qualifier starts on its
        own indented line and is wrapped on spaces, unless its key
        contains one of the consumer's ``remove_space_keys``, in which
        case it is cut at fixed width instead.
        """
        chunks = [
            Record.INTERNAL_FEATURE_FORMAT % self.key,
            _wrapped_genbank(self.location, Record.GB_FEATURE_INDENT,
                             split_char=','),
        ]
        for qual in self.qualifiers:
            chunks.append(" " * Record.GB_FEATURE_INDENT)

            # determine whether we can wrap on spaces
            no_space_keys = \
                Bio.GenBank._BaseGenBankConsumer.remove_space_keys
            if any(blocked in qual.key for blocked in no_space_keys):
                wrap_on_spaces = 0
            else:
                wrap_on_spaces = 1

            chunks.append(_wrapped_genbank(qual.key + qual.value,
                                           Record.GB_FEATURE_INDENT,
                                           wrap_on_spaces))
        return "".join(chunks)
658 659
class Qualifier(object):
    """Hold information about a qualifier in a GenBank feature.

    Attributes:

     - key - The key name of the qualifier (ie. /organism=)
     - value - The value of the qualifier ("Dictyostelium discoideum").
    """

    def __init__(self):
        """Create a qualifier whose key and value are both blank."""
        self.key = self.value = ''
671