Package Bio :: Package GenBank :: Module Record
[hide private]
[frames | no frames]

Source Code for Module Bio.GenBank.Record

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Hold GenBank data in a straightforward format. 
  7   
  8  Classes: 
  9   - Record - All of the information in a GenBank record. 
 10   - Reference - hold reference data for a record. 
 11   - Feature - Hold the information in a Feature Table. 
 12   - Qualifier - Qualifiers on a Feature. 
 13   
 14  17-MAR-2009: added support for WGS and WGS_SCAFLD lines.  Ying Huang & Iddo Friedberg 
 15  """ 
 16  # local stuff 
 17  import Bio.GenBank 
 18   
 19   
def _wrapped_genbank(information, indent, wrap_space=1, split_char=" "):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with carriage returns and
    indentation so it fits properly into a GenBank record.

    Arguments:
     - information - The string holding the information we want
       wrapped in GenBank method.
     - indent - The indentation on the lines we are writing.
     - wrap_space - Whether or not to wrap only on spaces in the
       information.  When false the text is cut into fixed-width pieces.
     - split_char - A specific character to split the lines on. By default
       spaces are used.

    Returns the wrapped text.  The first line carries no indent (the caller
    has already written the keyword column); every following line is
    prefixed with ``indent`` spaces.  Every line ends with a newline.

    Empty information, or information made up only of separators (for
    example a whitespace-only string), is written as ".", the marker this
    module uses for missing data.

    Raises ValueError when fixed-width cutting is requested but ``indent``
    leaves no room on the line.
    """
    info_length = Record.GB_LINE_LENGTH - indent

    if not information:
        # GenBank files use "." for missing data
        return ".\n"

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        if info_length < 1:
            # a non-positive width would never advance through the text
            raise ValueError("indent %r leaves no room on a %i character line"
                             % (indent, Record.GB_LINE_LENGTH))
        info_parts = [information[pos:pos + info_length]
                      for pos in range(0, len(information), info_length)]

    # first get the information string split up by line
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        # the "+ 1" accounts for the separator that would join the pieces
        if len(cur_part) + 1 + len(info_part) > info_length:
            if cur_part:
                # a non-space separator stays visible at the end of the line
                if split_char != " ":
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        elif cur_part == "":
            cur_part = info_part
        else:
            cur_part += split_char + info_part

    # add the last bit of information to the output
    if cur_part:
        output_parts.append(cur_part)

    if not output_parts:
        # only separators were supplied, so nothing printable is left
        return ".\n"

    # now format the information string for return
    continuation = "\n" + " " * indent
    return continuation.join(output_parts) + "\n"
78 79
def _indent_genbank(information, indent):
    """Write out information with the specified indent.

    Unlike _wrapped_genbank, this function makes no attempt to wrap
    lines -- it assumes that the information already has newlines in the
    appropriate places, and will add the specified indent to the start of
    each line.

    Arguments:
     - information - The text to write, with lines separated by newlines.
     - indent - Number of spaces placed in front of every line but the first.

    Returns the text with the first line left as it is, each later line
    prefixed with ``indent`` spaces, and a newline after the last line.
    """
    # joining on "newline + indent" indents every line except the first
    continuation = "\n" + " " * indent
    return continuation.join(information.split("\n")) + "\n"
97 98
class Record(object):
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
     - locus - The name specified after the LOCUS keyword in the GenBank
       record. This may be the accession number, or a clone id or something
       else.
     - size - The size of the record.
     - residue_type - The type of residues making up the sequence in this
       record. Normally something like RNA, DNA or PROTEIN, but may be as
       esoteric as 'ss-RNA circular'.
     - data_file_division - The division this record is stored under in
       GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
     - date - The date of submission of the record, in a form like
       '28-JUL-1998'
     - definition - The text written on the DEFINITION line.
     - accession - list of all accession numbers for the sequence.
     - nid - Nucleotide identifier number.
     - pid - Protein identifier number
     - version - The accession number + version (ie. AB01234.2)
     - db_source - Information about the database the record came from
     - gi - The NCBI gi identifier for the record.
     - keywords - A list of keywords related to the record.
     - segment - If the record is one of a series, this is info about which
       segment this record is (something like '1 of 6').
     - source - The source of material where the sequence came from.
     - organism - The genus and species of the organism (ie. 'Homo sapiens')
     - taxonomy - A listing of the taxonomic classification of the organism,
       starting general and getting more specific.
     - references - A list of Reference objects.
     - comment - Text with any kind of comment about the record.
     - features - A listing of Features making up the feature table.
     - base_counts - A string with the counts of bases for the sequence.
     - origin - A string specifying info about the origin of the sequence.
     - sequence - A string with the sequence itself.
     - contig - A string of location information for a CONTIG in a RefSeq file
     - projects - The genome sequencing project numbers
       (will be replaced by the dblink cross-references in 2009).
     - dblinks - The genome sequencing project number(s) and other links.
       (will replace the project information in 2009).
     - wgs - Text written on the WGS line.
     - wgs_scafld - List of entries, each written on its own WGS_SCAFLD line.
     - primary - A list, empty by default; it is not written by __str__.

    """

    # constants for outputting GenBank information
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # keyword column, left justified and padded out to the data column
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    # sub-keywords are indented inside the keyword column
    INTERNAL_FORMAT = " " * GB_INTERNAL_INDENT + "%-" + \
                      str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
    OTHER_INTERNAL_FORMAT = " " * GB_OTHER_INTERNAL_INDENT + "%-" + \
                            str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + \
                            "s"

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = " " * GB_FEATURE_INTERNAL_INDENT + "%-" + \
                              str(GB_FEATURE_INDENT -
                                  GB_FEATURE_INTERNAL_INDENT) + "s"
    # sequence coordinates are right justified
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"

    def __init__(self):
        """Initialize every attribute to an empty string or an empty list."""
        self.accession = []
        self.base_counts = ''
        self.comment = ''
        self.contig = ''
        self.data_file_division = ''
        self.date = ''
        self.db_source = ''
        self.dblinks = []
        self.definition = ''
        self.features = []
        self.gi = ''
        self.keywords = []
        self.locus = ''
        self.nid = ''
        self.organism = ''
        self.origin = ''
        self.pid = ''
        self.primary = []
        self.projects = []
        self.references = []
        self.residue_type = ''
        self.segment = ''
        self.sequence = ''
        self.size = ''
        self.source = ''
        self.taxonomy = []
        self.version = ''
        self.wgs = ''
        self.wgs_scafld = []

    def __str__(self):
        """Provide a GenBank formatted output option for a Record.

        The objective of this is to provide an easy way to read in a GenBank
        record, modify it somehow, and then output it in 'GenBank format.'
        We are striving to make this work so that a parsed Record that is
        output using this function will look exactly like the original
        record.

        Much of the output is based on format description info at:

        ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt
        """
        # the order of the pieces is the order of the sections in the output
        pieces = [
            self._locus_line(),
            self._definition_line(),
            self._accession_line(),
            self._version_line(),
            self._project_line(),
            self._dblink_line(),
            self._nid_line(),
            self._pid_line(),
            self._keywords_line(),
            self._db_source_line(),
            self._segment_line(),
            self._source_line(),
            self._organism_line(),
        ]
        pieces.extend(str(reference) for reference in self.references)
        pieces.append(self._comment_line())
        pieces.append(self._features_line())
        pieces.extend(str(feature) for feature in self.features)
        pieces.extend([
            self._base_count_line(),
            self._origin_line(),
            self._sequence_line(),
            self._wgs_line(),
            self._wgs_scafld_line(),
            self._contig_line(),
            "//",  # record terminator, written without a trailing newline
        ])
        return "".join(pieces)

    def _locus_line(self):
        """Provide the output string for the LOCUS line."""
        pieces = [
            "LOCUS",
            " " * 7,  # 6-12 spaces
            "%-9s" % self.locus,
            " ",  # 22 space
            "%7s" % self.size,
            " aa" if "PROTEIN" in self.residue_type else " bp ",
        ]

        # treat circular types differently, since they'll have long residue
        # types
        if "circular" in self.residue_type:
            pieces.append("%17s" % self.residue_type)
        # second case: ss-DNA types of records
        elif "-" in self.residue_type:
            pieces.append("%7s" % self.residue_type)
            pieces.append(" " * 10)  # spaces for circular
        else:
            pieces.append(" " * 3)  # spaces for stuff like ss-
            pieces.append("%-4s" % self.residue_type)
            pieces.append(" " * 10)  # spaces for circular

        pieces.append(" " * 2)
        pieces.append("%3s" % self.data_file_division)
        pieces.append(" " * 7)  # spaces for 56-63
        pieces.append("%11s" % self.date)
        pieces.append("\n")
        return "".join(pieces)

    def _definition_line(self):
        """Provide output for the DEFINITION line.

        A period is always appended to the stored definition.
        """
        output = Record.BASE_FORMAT % "DEFINITION"
        output += _wrapped_genbank(self.definition + ".",
                                   Record.GB_BASE_INDENT)
        return output

    def _accession_line(self):
        """Output for the ACCESSION line; empty when there are no accessions."""
        if not self.accession:
            return ""
        # accessions are separated by single spaces, no trailing whitespace
        acc_info = " ".join(self.accession).rstrip()
        output = Record.BASE_FORMAT % "ACCESSION"
        output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT)
        return output

    def _version_line(self):
        """Output for the VERSION line; empty when no version is set.

        The GI identifier is appended only when one is stored, so a record
        without a GI does not end in a dangling "GI:".
        """
        if not self.version:
            return ""
        output = Record.BASE_FORMAT % "VERSION" + self.version
        if self.gi:
            # the GenBank sample record has two spaces before "GI:"
            output += "  GI:%s" % self.gi
        return output + "\n"

    def _project_line(self):
        """Output for the PROJECT line; empty when there are no projects."""
        if not self.projects:
            return ""
        output = Record.BASE_FORMAT % "PROJECT"
        output += "%s\n" % " ".join(self.projects)
        return output

    def _dblink_line(self):
        """Output for the DBLINK lines; empty when there are no dblinks.

        Each entry of ``dblinks`` is written on its own line; lines after
        the first are indented to the data column.
        """
        if not self.dblinks:
            return ""
        output = Record.BASE_FORMAT % "DBLINK"
        output += _indent_genbank("\n".join(self.dblinks),
                                  Record.GB_BASE_INDENT)
        return output

    def _nid_line(self):
        """Output for the NID line. Use of NID is obsolete in GenBank files."""
        if not self.nid:
            return ""
        return Record.BASE_FORMAT % "NID" + "%s\n" % self.nid

    def _pid_line(self):
        """Output for PID line. Presumedly, PID usage is also obsolete."""
        if not self.pid:
            return ""
        return Record.BASE_FORMAT % "PID" + "%s\n" % self.pid

    def _keywords_line(self):
        """Output for the KEYWORDS line.

        The line is always written.  Keywords are separated by "; " and
        closed with a period, so an empty keyword list is written as ".".
        """
        keyword_info = "; ".join(self.keywords) + "."
        output = Record.BASE_FORMAT % "KEYWORDS"
        output += _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT)
        return output

    def _db_source_line(self):
        """Output for DBSOURCE line; empty when no db_source is set."""
        if not self.db_source:
            return ""
        return Record.BASE_FORMAT % "DBSOURCE" + "%s\n" % self.db_source

    def _segment_line(self):
        """Output for the SEGMENT line; empty when no segment is set."""
        if not self.segment:
            return ""
        output = Record.BASE_FORMAT % "SEGMENT"
        output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT)
        return output

    def _source_line(self):
        """Output for SOURCE line on where the sample came from."""
        output = Record.BASE_FORMAT % "SOURCE"
        output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT)
        return output

    def _organism_line(self):
        """Output for ORGANISM line with taxonomy info."""
        output = Record.INTERNAL_FORMAT % "ORGANISM"
        # Now that species names can be too long, this line can wrap (Bug 2591)
        output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT)
        # the taxonomy starts on a fresh line, indented to the data column
        output += " " * Record.GB_BASE_INDENT
        # levels are separated by "; " and closed with a period
        taxonomy_info = "; ".join(self.taxonomy) + "."
        output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT)
        return output

    def _comment_line(self):
        """Output for the COMMENT lines; empty when there is no comment.

        The comment keeps its own line breaks and is only indented.
        """
        if not self.comment:
            return ""
        output = Record.BASE_FORMAT % "COMMENT"
        output += _indent_genbank(self.comment, Record.GB_BASE_INDENT)
        return output

    def _features_line(self):
        """Output for the FEATURES line; empty when there are no features."""
        if not self.features:
            return ""
        output = Record.BASE_FEATURE_FORMAT % "FEATURES"
        output += "Location/Qualifiers\n"
        return output

    def _base_count_line(self):
        """Output for the BASE COUNT line with base information."""
        if not self.base_counts:
            return ""
        output = Record.BASE_FORMAT % "BASE COUNT "
        # split up the base counts into their individual parts
        count_parts = [part for part in self.base_counts.split(" ") if part]
        # deal with the standard case, with a normal origin line
        # like: 474 a 356 c 428 g 364 t
        if len(count_parts) % 2 == 0:
            pairs = zip(count_parts[0::2], count_parts[1::2])
            output += "".join("%7s %s" % pair for pair in pairs)
        # deal with ugly ORIGIN lines like:
        # 1311257 a2224835 c2190093 g1309889 t
        # by just outputting the raw information
        else:
            output += self.base_counts
        return output + "\n"

    def _origin_line(self):
        """Output for the ORIGIN line."""
        # only output the ORIGIN line if we have a sequence
        if not self.sequence:
            return ""
        output = Record.BASE_FORMAT % "ORIGIN"
        if self.origin:
            output += _wrapped_genbank(self.origin, Record.GB_BASE_INDENT)
        else:
            output += "\n"
        return output

    def _sequence_line(self):
        """Output for all of the sequence.

        The sequence is written in lower case, 60 residues per line in
        groups of 10, each line starting with the 1-based position of its
        first residue.  No trailing space is written after the last group.
        """
        if not self.sequence:
            return ""
        rows = []
        for row_start in range(0, len(self.sequence), 60):
            row = self.sequence[row_start:row_start + 60]
            groups = "".join(" %s" % row[pos:pos + 10].lower()
                             for pos in range(0, len(row), 10))
            rows.append(Record.SEQUENCE_FORMAT % str(row_start + 1) +
                        groups + "\n")
        return "".join(rows)

    def _wgs_line(self):
        """Output for the WGS line; empty when no wgs information is set."""
        if not self.wgs:
            return ""
        output = Record.BASE_FORMAT % "WGS" + self.wgs
        # terminate the line so the next section starts on its own line
        if not output.endswith("\n"):
            output += "\n"
        return output

    def _wgs_scafld_line(self):
        """Output for the WGS_SCAFLD lines; empty when there are none.

        ``wgs_scafld`` is a list; each entry is written on its own
        WGS_SCAFLD line.  A plain string is accepted as a single entry.
        """
        if not self.wgs_scafld:
            return ""
        entries = self.wgs_scafld
        if isinstance(entries, str):
            entries = [entries]
        keyword = Record.BASE_FORMAT % "WGS_SCAFLD"
        return "".join(keyword + "%s\n" % entry for entry in entries)

    def _contig_line(self):
        """Output for CONTIG location information from RefSeq."""
        if not self.contig:
            return ""
        output = Record.BASE_FORMAT % "CONTIG"
        # locations are long comma separated lists, so wrap on the commas
        output += _wrapped_genbank(self.contig,
                                   Record.GB_BASE_INDENT, split_char=',')
        return output
490 491
class Reference(object):
    """Hold information from a GenBank reference.

    Attributes:
     - number - The number of the reference in the listing of references.
     - bases - The bases in the sequence the reference refers to.
     - authors - String with all of the authors.
     - consrtm - Consortium the authors belong to.
     - title - The title of the reference.
     - journal - Information about the journal where the reference appeared.
     - medline_id - The medline id for the reference.
     - pubmed_id - The pubmed_id for the reference.
     - remark - Free-form remarks about the reference.

    """

    def __init__(self):
        """Start with every field set to an empty string."""
        self.number = ''
        self.bases = ''
        self.authors = ''
        self.consrtm = ''
        self.title = ''
        self.journal = ''
        self.medline_id = ''
        self.pubmed_id = ''
        self.remark = ''

    def __str__(self):
        """Convert the reference to a GenBank format string."""
        # sections appear in the output in exactly this order
        sections = (
            self._reference_line(),
            self._authors_line(),
            self._consrtm_line(),
            self._title_line(),
            self._journal_line(),
            self._medline_line(),
            self._pubmed_line(),
            self._remark_line(),
        )
        return "".join(sections)

    def _reference_line(self):
        """Output for REFERENCE lines.

        The keyword is always written; the number, and the bases when
        present, follow it.
        """
        header = Record.BASE_FORMAT % "REFERENCE"
        if not self.number:
            return header + "\n"
        if self.bases:
            # the number is padded to three columns in front of the bases
            return header + "%-3s" % self.number + "%s" % self.bases + "\n"
        return header + "%s" % self.number + "\n"

    def _authors_line(self):
        """Output for AUTHORS information; empty when no authors are set."""
        if not self.authors:
            return ""
        label = Record.INTERNAL_FORMAT % "AUTHORS"
        return label + _wrapped_genbank(self.authors, Record.GB_BASE_INDENT)

    def _consrtm_line(self):
        """Output for CONSRTM information; empty when no consortium is set."""
        if not self.consrtm:
            return ""
        label = Record.INTERNAL_FORMAT % "CONSRTM"
        return label + _wrapped_genbank(self.consrtm, Record.GB_BASE_INDENT)

    def _title_line(self):
        """Output for TITLE information; empty when no title is set."""
        if not self.title:
            return ""
        label = Record.INTERNAL_FORMAT % "TITLE"
        return label + _wrapped_genbank(self.title, Record.GB_BASE_INDENT)

    def _journal_line(self):
        """Output for JOURNAL information; empty when no journal is set."""
        if not self.journal:
            return ""
        label = Record.INTERNAL_FORMAT % "JOURNAL"
        return label + _wrapped_genbank(self.journal, Record.GB_BASE_INDENT)

    def _medline_line(self):
        """Output for MEDLINE information; empty when no medline id is set."""
        if not self.medline_id:
            return ""
        return Record.INTERNAL_FORMAT % "MEDLINE" + self.medline_id + "\n"

    def _pubmed_line(self):
        """Output for PUBMED information; empty when no pubmed id is set."""
        if not self.pubmed_id:
            return ""
        # PUBMED uses the other internal format, unlike its siblings
        return Record.OTHER_INTERNAL_FORMAT % "PUBMED" + self.pubmed_id + "\n"

    def _remark_line(self):
        """Output for REMARK information; empty when no remark is set."""
        if not self.remark:
            return ""
        label = Record.INTERNAL_FORMAT % "REMARK"
        return label + _wrapped_genbank(self.remark, Record.GB_BASE_INDENT)
601 602
class Feature(object):
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
     - key - The key name of the feature (ie. source)
     - location - The string specifying the location of the feature.
     - qualifiers - A listing of Qualifier objects in the feature.

    """

    def __init__(self):
        """Start with an empty key, an empty location and no qualifiers."""
        self.key = ''
        self.location = ''
        self.qualifiers = []

    def __str__(self):
        """Return feature as a GenBank format string."""
        pieces = [
            Record.INTERNAL_FEATURE_FORMAT % self.key,
            # locations are comma separated, so they wrap on commas
            _wrapped_genbank(self.location, Record.GB_FEATURE_INDENT,
                             split_char=','),
        ]
        for qualifier in self.qualifiers:
            pieces.append(" " * Record.GB_FEATURE_INDENT)

            # determine whether we can wrap on spaces: qualifiers whose key
            # contains one of the remove_space_keys are cut at fixed width
            no_space_keys = Bio.GenBank._BaseGenBankConsumer.remove_space_keys
            if any(no_space_key in qualifier.key
                   for no_space_key in no_space_keys):
                space_wrap = 0
            else:
                space_wrap = 1

            pieces.append(_wrapped_genbank(qualifier.key + qualifier.value,
                                           Record.GB_FEATURE_INDENT,
                                           space_wrap))
        return "".join(pieces)
637 638
class Qualifier(object):
    """Hold information about a qualifier in a GenBank feature.

    Attributes:
     - key - The key name of the qualifier (ie. /organism=)
     - value - The value of the qualifier ("Dictyostelium discoideum").

    """

    def __init__(self):
        """Start with both the key and the value as empty strings."""
        self.key, self.value = '', ''
652