Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

   1  # Copyright 2000-2002 Andrew Dalke.  All rights reserved. 
   2  # Copyright 2002-2004 Brad Chapman.  All rights reserved. 
   3  # Copyright 2006-2017 by Peter Cock.  All rights reserved. 
   4  # 
   5  # This file is part of the Biopython distribution and governed by your 
   6  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
   7  # Please see the LICENSE file that should have been included as part of this 
   8  # package. 
   9  """Represent a Sequence Record, a sequence with annotation.""" 
  10   
  11   
  12  from Bio._py3k import basestring 
  13   
  14  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
  15  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
  16  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
  17  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
  18   
  19  _NO_SEQRECORD_COMPARISON = "SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest." 
  20   
  21   
22 -class _RestrictedDict(dict):
23 """Dict which only allows sequences of given length as values (PRIVATE). 24 25 This simple subclass of the Python dictionary is used in the SeqRecord 26 object for holding per-letter-annotations. This class is intended to 27 prevent simple errors by only allowing python sequences (e.g. lists, 28 strings and tuples) to be stored, and only if their length matches that 29 expected (the length of the SeqRecord's seq object). It cannot however 30 prevent the entries being edited in situ (for example appending entries 31 to a list). 32 33 >>> x = _RestrictedDict(5) 34 >>> x["test"] = "hello" 35 >>> x 36 {'test': 'hello'} 37 38 Adding entries which don't have the expected length are blocked: 39 40 >>> x["test"] = "hello world" 41 Traceback (most recent call last): 42 ... 43 TypeError: We only allow python sequences (lists, tuples or strings) of length 5. 44 45 The expected length is stored as a private attribute, 46 47 >>> x._length 48 5 49 50 In order that the SeqRecord (and other objects using this class) can be 51 pickled, for example for use in the multiprocessing library, we need to 52 be able to pickle the restricted dictionary objects. 53 54 Using the default protocol, which is 0 on Python 2.x, 55 56 >>> import pickle 57 >>> y = pickle.loads(pickle.dumps(x)) 58 >>> y 59 {'test': 'hello'} 60 >>> y._length 61 5 62 63 Using the highest protocol, which is 2 on Python 2.x, 64 65 >>> import pickle 66 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL)) 67 >>> z 68 {'test': 'hello'} 69 >>> z._length 70 5 71 """ 72
73 - def __init__(self, length):
74 """Create an EMPTY restricted dictionary.""" 75 dict.__init__(self) 76 self._length = int(length)
77
78 - def __setitem__(self, key, value):
79 # The check hasattr(self, "_length") is to cope with pickle protocol 2 80 # I couldn't seem to avoid this with __getstate__ and __setstate__ 81 if not hasattr(value, "__len__") or not hasattr(value, "__getitem__") \ 82 or (hasattr(self, "_length") and len(value) != self._length): 83 raise TypeError("We only allow python sequences (lists, tuples or " 84 "strings) of length {0}.".format(self._length)) 85 dict.__setitem__(self, key, value)
86
87 - def update(self, new_dict):
88 # Force this to go via our strict __setitem__ method 89 for (key, value) in new_dict.items(): 90 self[key] = value
91 92
93 -class SeqRecord(object):
94 """A SeqRecord object holds a sequence and information about it. 95 96 Main attributes: 97 - id - Identifier such as a locus tag (string) 98 - seq - The sequence itself (Seq object or similar) 99 100 Additional attributes: 101 - name - Sequence name, e.g. gene name (string) 102 - description - Additional text (string) 103 - dbxrefs - List of database cross references (list of strings) 104 - features - Any (sub)features defined (list of SeqFeature objects) 105 - annotations - Further information about the whole sequence (dictionary). 106 Most entries are strings, or lists of strings. 107 - letter_annotations - Per letter/symbol annotation (restricted 108 dictionary). This holds Python sequences (lists, strings 109 or tuples) whose length matches that of the sequence. 110 A typical use would be to hold a list of integers 111 representing sequencing quality scores, or a string 112 representing the secondary structure. 113 114 You will typically use Bio.SeqIO to read in sequences from files as 115 SeqRecord objects. However, you may want to create your own SeqRecord 116 objects directly (see the __init__ method for further details): 117 118 >>> from Bio.Seq import Seq 119 >>> from Bio.SeqRecord import SeqRecord 120 >>> from Bio.Alphabet import IUPAC 121 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 122 ... IUPAC.protein), 123 ... id="YP_025292.1", name="HokC", 124 ... description="toxic membrane protein") 125 >>> print(record) 126 ID: YP_025292.1 127 Name: HokC 128 Description: toxic membrane protein 129 Number of features: 0 130 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 131 132 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 133 for this. For the special case where you want the SeqRecord turned into 134 a string in a particular file format there is a format method which uses 135 Bio.SeqIO internally: 136 137 >>> print(record.format("fasta")) 138 >YP_025292.1 toxic membrane protein 139 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 140 <BLANKLINE> 141 142 You can also do things like slicing a SeqRecord, checking its length, etc 143 144 >>> len(record) 145 44 146 >>> edited = record[:10] + record[11:] 147 >>> print(edited.seq) 148 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 149 >>> print(record.seq) 150 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 151 152 """ 153
154 - def __init__(self, seq, id="<unknown id>", name="<unknown name>", 155 description="<unknown description>", dbxrefs=None, 156 features=None, annotations=None, 157 letter_annotations=None):
158 """Create a SeqRecord. 159 160 Arguments: 161 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 162 - id - Sequence identifier, recommended (string) 163 - name - Sequence name, optional (string) 164 - description - Sequence description, optional (string) 165 - dbxrefs - Database cross references, optional (list of strings) 166 - features - Any (sub)features, optional (list of SeqFeature objects) 167 - annotations - Dictionary of annotations for the whole sequence 168 - letter_annotations - Dictionary of per-letter-annotations, values 169 should be strings, list or tuples of the same length as the full 170 sequence. 171 172 You will typically use Bio.SeqIO to read in sequences from files as 173 SeqRecord objects. However, you may want to create your own SeqRecord 174 objects directly. 175 176 Note that while an id is optional, we strongly recommend you supply a 177 unique id string for each record. This is especially important 178 if you wish to write your sequences to a file. 179 180 If you don't have the actual sequence, but you do know its length, 181 then using the UnknownSeq object from Bio.Seq is appropriate. 182 183 You can create a 'blank' SeqRecord object, and then populate the 184 attributes later. 185 """ 186 if id is not None and not isinstance(id, basestring): 187 # Lots of existing code uses id=None... this may be a bad idea. 188 raise TypeError("id argument should be a string") 189 if not isinstance(name, basestring): 190 raise TypeError("name argument should be a string") 191 if not isinstance(description, basestring): 192 raise TypeError("description argument should be a string") 193 self._seq = seq 194 self.id = id 195 self.name = name 196 self.description = description 197 198 # database cross references (for the whole sequence) 199 if dbxrefs is None: 200 dbxrefs = [] 201 elif not isinstance(dbxrefs, list): 202 raise TypeError("dbxrefs argument should be a list (of strings)") 203 self.dbxrefs = dbxrefs 204 205 # annotations about the whole sequence 206 if annotations is None: 207 annotations = {} 208 elif not isinstance(annotations, dict): 209 raise TypeError("annotations argument should be a dict") 210 self.annotations = annotations 211 212 if letter_annotations is None: 213 # annotations about each letter in the sequence 214 if seq is None: 215 # Should we allow this and use a normal unrestricted dict? 216 self._per_letter_annotations = _RestrictedDict(length=0) 217 else: 218 try: 219 self._per_letter_annotations = \ 220 _RestrictedDict(length=len(seq)) 221 except TypeError: 222 raise TypeError("seq argument should be a Seq object or similar") 223 else: 224 # This will be handled via the property set function, which will 225 # turn this into a _RestrictedDict and thus ensure all the values 226 # in the dict are the right length 227 self.letter_annotations = letter_annotations 228 229 # annotations about parts of the sequence 230 if features is None: 231 features = [] 232 elif not isinstance(features, list): 233 raise TypeError("features argument should be a list (of SeqFeature objects)") 234 self.features = features
235 236 # TODO - Just make this a read only property?
237 - def _set_per_letter_annotations(self, value):
238 if not isinstance(value, dict): 239 raise TypeError("The per-letter-annotations should be a " 240 "(restricted) dictionary.") 241 # Turn this into a restricted-dictionary (and check the entries) 242 try: 243 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 244 except AttributeError: 245 # e.g. seq is None 246 self._per_letter_annotations = _RestrictedDict(length=0) 247 self._per_letter_annotations.update(value)
248 letter_annotations = property( 249 fget=lambda self: self._per_letter_annotations, 250 fset=_set_per_letter_annotations, 251 doc="""Dictionary of per-letter-annotation for the sequence. 252 253 For example, this can hold quality scores used in FASTQ or QUAL files. 254 Consider this example using Bio.SeqIO to read in an example Solexa 255 variant FASTQ file as a SeqRecord: 256 257 >>> from Bio import SeqIO 258 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 259 >>> print("%s %s" % (record.id, record.seq)) 260 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 261 >>> print(list(record.letter_annotations)) 262 ['solexa_quality'] 263 >>> print(record.letter_annotations["solexa_quality"]) 264 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 265 266 The letter_annotations get sliced automatically if you slice the 267 parent SeqRecord, for example taking the last ten bases: 268 269 >>> sub_record = record[-10:] 270 >>> print("%s %s" % (sub_record.id, sub_record.seq)) 271 slxa_0001_1_0001_01 ACGTNNNNNN 272 >>> print(sub_record.letter_annotations["solexa_quality"]) 273 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 274 275 Any python sequence (i.e. list, tuple or string) can be recorded in 276 the SeqRecord's letter_annotations dictionary as long as the length 277 matches that of the SeqRecord's sequence. e.g. 278 279 >>> len(sub_record.letter_annotations) 280 1 281 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 282 >>> len(sub_record.letter_annotations) 283 2 284 285 You can delete entries from the letter_annotations dictionary as usual: 286 287 >>> del sub_record.letter_annotations["solexa_quality"] 288 >>> sub_record.letter_annotations 289 {'dummy': 'abcdefghij'} 290 291 You can completely clear the dictionary easily as follows: 292 293 >>> sub_record.letter_annotations = {} 294 >>> sub_record.letter_annotations 295 {} 296 297 Note that if replacing the record's sequence with a sequence of a 298 different length you must first clear the letter_annotations dict. 299 """) 300
301 - def _set_seq(self, value):
302 # TODO - Add a deprecation warning that the seq should be write only? 303 if self._per_letter_annotations: 304 if len(self) != len(value): 305 # TODO - Make this a warning? Silently empty the dictionary? 306 raise ValueError("You must empty the letter annotations first!") 307 else: 308 # Leave the existing per letter annotations unchanged: 309 self._seq = value 310 else: 311 self._seq = value 312 # Reset the (empty) letter annotations dict with new length: 313 try: 314 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 315 except AttributeError: 316 # e.g. seq is None 317 self._per_letter_annotations = _RestrictedDict(length=0)
318 319 seq = property(fget=lambda self: self._seq, 320 fset=_set_seq, 321 doc="The sequence itself, as a Seq or MutableSeq object.") 322
323 - def __getitem__(self, index):
324 """Return a sub-sequence or an individual letter. 325 326 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 327 that sub-sequence with appropriate annotation preserved. The 328 name, id and description are kept. 329 330 Any per-letter-annotations are sliced to match the requested 331 sub-sequence. Unless a stride is used, all those features 332 which fall fully within the subsequence are included (with 333 their locations adjusted accordingly). 334 335 However, the annotations dictionary and the dbxrefs list are 336 not used for the new SeqRecord, as in general they may not 337 apply to the subsequence. If you want to preserve them, you 338 must explicitly copy them to the new SeqRecord yourself. 339 340 Using an integer index, e.g. my_record[5] is shorthand for 341 extracting that letter from the sequence, my_record.seq[5]. 342 343 For example, consider this short protein and its secondary 344 structure as encoded by the PDB (e.g. H for alpha helices), 345 plus a simple feature for its histidine self phosphorylation 346 site: 347 348 >>> from Bio.Seq import Seq 349 >>> from Bio.SeqRecord import SeqRecord 350 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 351 >>> from Bio.Alphabet import IUPAC 352 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 353 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 354 ... IUPAC.protein), 355 ... id="1JOY", name="EnvZ", 356 ... description="Homodimeric domain of EnvZ from E. coli") 357 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 358 >>> rec.features.append(SeqFeature(FeatureLocation(20, 21), 359 ... type = "Site")) 360 361 Now let's have a quick look at the full record, 362 363 >>> print(rec) 364 ID: 1JOY 365 Name: EnvZ 366 Description: Homodimeric domain of EnvZ from E. coli 367 Number of features: 1 368 Per letter annotation for: secondary_structure 369 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 370 >>> rec.letter_annotations["secondary_structure"] 371 ' S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT ' 372 >>> print(rec.features[0].location) 373 [20:21] 374 375 Now let's take a sub sequence, here chosen as the first (fractured) 376 alpha helix which includes the histidine phosphorylation site: 377 378 >>> sub = rec[11:41] 379 >>> print(sub) 380 ID: 1JOY 381 Name: EnvZ 382 Description: Homodimeric domain of EnvZ from E. coli 383 Number of features: 1 384 Per letter annotation for: secondary_structure 385 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 386 >>> sub.letter_annotations["secondary_structure"] 387 'HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH' 388 >>> print(sub.features[0].location) 389 [9:10] 390 391 You can also of course omit the start or end values, for 392 example to get the first ten letters only: 393 394 >>> print(rec[:10]) 395 ID: 1JOY 396 Name: EnvZ 397 Description: Homodimeric domain of EnvZ from E. coli 398 Number of features: 0 399 Per letter annotation for: secondary_structure 400 Seq('MAAGVKQLAD', IUPACProtein()) 401 402 Or for the last ten letters: 403 404 >>> print(rec[-10:]) 405 ID: 1JOY 406 Name: EnvZ 407 Description: Homodimeric domain of EnvZ from E. coli 408 Number of features: 0 409 Per letter annotation for: secondary_structure 410 Seq('IIEQFIDYLR', IUPACProtein()) 411 412 If you omit both, then you get a copy of the original record (although 413 lacking the annotations and dbxrefs): 414 415 >>> print(rec[:]) 416 ID: 1JOY 417 Name: EnvZ 418 Description: Homodimeric domain of EnvZ from E. coli 419 Number of features: 1 420 Per letter annotation for: secondary_structure 421 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 422 423 Finally, indexing with a simple integer is shorthand for pulling out 424 that letter from the sequence directly: 425 426 >>> rec[5] 427 'K' 428 >>> rec.seq[5] 429 'K' 430 """ 431 if isinstance(index, int): 432 # NOTE - The sequence level annotation like the id, name, etc 433 # do not really apply to a single character. However, should 434 # we try and expose any per-letter-annotation here? If so how? 435 return self.seq[index] 436 elif isinstance(index, slice): 437 if self.seq is None: 438 raise ValueError("If the sequence is None, we cannot slice it.") 439 parent_length = len(self) 440 try: 441 from BioSQL.BioSeq import DBSeqRecord 442 biosql_available = True 443 except ImportError: 444 biosql_available = False 445 446 if biosql_available and isinstance(self, DBSeqRecord): 447 answer = SeqRecord(self.seq[index], 448 id=self.id, 449 name=self.name, 450 description=self.description) 451 else: 452 answer = self.__class__(self.seq[index], 453 id=self.id, 454 name=self.name, 455 description=self.description) 456 # TODO - The description may no longer apply. 457 # It would be safer to change it to something 458 # generic like "edited" or the default value. 459 460 # Don't copy the annotation dict and dbxefs list, 461 # they may not apply to a subsequence. 462 # answer.annotations = dict(self.annotations.items()) 463 # answer.dbxrefs = self.dbxrefs[:] 464 # TODO - Review this in light of adding SeqRecord objects? 465 466 # TODO - Cope with strides by generating ambiguous locations? 467 start, stop, step = index.indices(parent_length) 468 if step == 1: 469 # Select relevant features, add them with shifted locations 470 # assert str(self.seq)[index] == str(self.seq)[start:stop] 471 for f in self.features: 472 if f.ref or f.ref_db: 473 # TODO - Implement this (with lots of tests)? 474 import warnings 475 warnings.warn("When slicing SeqRecord objects, any " 476 "SeqFeature referencing other sequences (e.g. " 477 "from segmented GenBank records) are ignored.") 478 continue 479 if start <= f.location.nofuzzy_start \ 480 and f.location.nofuzzy_end <= stop: 481 answer.features.append(f._shift(-start)) 482 483 # Slice all the values to match the sliced sequence 484 # (this should also work with strides, even negative strides): 485 for key, value in self.letter_annotations.items(): 486 answer._per_letter_annotations[key] = value[index] 487 488 return answer 489 raise ValueError("Invalid index")
490
491 - def __iter__(self):
492 """Iterate over the letters in the sequence. 493 494 For example, using Bio.SeqIO to read in a protein FASTA file: 495 496 >>> from Bio import SeqIO 497 >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta") 498 >>> for amino in record: 499 ... print(amino) 500 ... if amino == "L": break 501 X 502 A 503 G 504 L 505 >>> print(record.seq[3]) 506 L 507 508 This is just a shortcut for iterating over the sequence directly: 509 510 >>> for amino in record.seq: 511 ... print(amino) 512 ... if amino == "L": break 513 X 514 A 515 G 516 L 517 >>> print(record.seq[3]) 518 L 519 520 Note that this does not facilitate iteration together with any 521 per-letter-annotation. However, you can achieve that using the 522 python zip function on the record (or its sequence) and the relevant 523 per-letter-annotation: 524 525 >>> from Bio import SeqIO 526 >>> rec = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 527 >>> print("%s %s" % (rec.id, rec.seq)) 528 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 529 >>> print(list(rec.letter_annotations)) 530 ['solexa_quality'] 531 >>> for nuc, qual in zip(rec, rec.letter_annotations["solexa_quality"]): 532 ... if qual > 35: 533 ... print("%s %i" % (nuc, qual)) 534 A 40 535 C 39 536 G 38 537 T 37 538 A 36 539 540 You may agree that using zip(rec.seq, ...) is more explicit than using 541 zip(rec, ...) as shown above. 542 """ 543 return iter(self.seq)
544
545 - def __contains__(self, char):
546 """Implement the 'in' keyword, searches the sequence. 547 548 e.g. 549 550 >>> from Bio import SeqIO 551 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 552 >>> "GAATTC" in record 553 False 554 >>> "AAA" in record 555 True 556 557 This essentially acts as a proxy for using "in" on the sequence: 558 559 >>> "GAATTC" in record.seq 560 False 561 >>> "AAA" in record.seq 562 True 563 564 Note that you can also use Seq objects as the query, 565 566 >>> from Bio.Seq import Seq 567 >>> from Bio.Alphabet import generic_dna 568 >>> Seq("AAA") in record 569 True 570 >>> Seq("AAA", generic_dna) in record 571 True 572 573 See also the Seq object's __contains__ method. 574 """ 575 return char in self.seq
576
577 - def __str__(self):
578 """Return a human readable summary of the record and its annotation (string). 579 580 The python built in function str works by calling the object's ___str__ 581 method. e.g. 582 583 >>> from Bio.Seq import Seq 584 >>> from Bio.SeqRecord import SeqRecord 585 >>> from Bio.Alphabet import IUPAC 586 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 587 ... IUPAC.protein), 588 ... id="YP_025292.1", name="HokC", 589 ... description="toxic membrane protein, small") 590 >>> print(str(record)) 591 ID: YP_025292.1 592 Name: HokC 593 Description: toxic membrane protein, small 594 Number of features: 0 595 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 596 597 In this example you don't actually need to call str explicity, as the 598 print command does this automatically: 599 600 >>> print(record) 601 ID: YP_025292.1 602 Name: HokC 603 Description: toxic membrane protein, small 604 Number of features: 0 605 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 606 607 Note that long sequences are shown truncated. 608 """ 609 lines = [] 610 if self.id: 611 lines.append("ID: {0}".format(self.id)) 612 if self.name: 613 lines.append("Name: {0}".format(self.name)) 614 if self.description: 615 lines.append("Description: {0}".format(self.description)) 616 if self.dbxrefs: 617 lines.append("Database cross-references: " + ", ".join(self.dbxrefs)) 618 lines.append("Number of features: {0}".format(len(self.features))) 619 for a in self.annotations: 620 lines.append("/{0}={1}".format(a, str(self.annotations[a]))) 621 if self.letter_annotations: 622 lines.append("Per letter annotation for: " + ", ".join(self.letter_annotations)) 623 # Don't want to include the entire sequence, 624 # and showing the alphabet is useful: 625 lines.append(repr(self.seq)) 626 return "\n".join(lines)
627
628 - def __repr__(self):
629 """Return a concise summary of the record for debugging (string). 630 631 The python built in function repr works by calling the object's ___repr__ 632 method. e.g. 633 634 >>> from Bio.Seq import Seq 635 >>> from Bio.SeqRecord import SeqRecord 636 >>> from Bio.Alphabet import generic_protein 637 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 638 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 639 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 640 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 641 ... generic_protein), 642 ... id="NP_418483.1", name="b4059", 643 ... description="ssDNA-binding protein", 644 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 645 >>> print(repr(rec)) 646 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 647 648 At the python prompt you can also use this shorthand: 649 650 >>> rec 651 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 652 653 Note that long sequences are shown truncated. Also note that any 654 annotations, letter_annotations and features are not shown (as they 655 would lead to a very long string). 656 """ 657 return "{0}(seq={1!r}, id={2!r}, name={3!r}, description={4!r}, dbxrefs={5!r})".format( 658 self.__class__.__name__, 659 self.seq, self.id, self.name, 660 self.description, self.dbxrefs)
661
662 - def format(self, format):
663 r"""Return the record as a string in the specified file format. 664 665 The format should be a lower case string supported as an output 666 format by Bio.SeqIO, which is used to turn the SeqRecord into a 667 string. e.g. 668 669 >>> from Bio.Seq import Seq 670 >>> from Bio.SeqRecord import SeqRecord 671 >>> from Bio.Alphabet import IUPAC 672 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 673 ... IUPAC.protein), 674 ... id="YP_025292.1", name="HokC", 675 ... description="toxic membrane protein") 676 >>> record.format("fasta") 677 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 678 >>> print(record.format("fasta")) 679 >YP_025292.1 toxic membrane protein 680 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 681 <BLANKLINE> 682 683 The python print command automatically appends a new line, meaning 684 in this example a blank line is shown. If you look at the string 685 representation you can see there is a trailing new line (shown as 686 slash n) which is important when writing to a file or if 687 concatenating multiple sequence strings together. 688 689 Note that this method will NOT work on every possible file format 690 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 691 """ 692 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 693 # See also the Bio.Align.Generic.Alignment class and its format() 694 return self.__format__(format)
695
696 - def __format__(self, format_spec):
697 """Return the record as a string in the specified file format. 698 699 This method supports the python format() function added in 700 Python 2.6/3.0. The format_spec should be a lower case string 701 supported by Bio.SeqIO as an output file format. See also the 702 SeqRecord's format() method. 703 704 Under Python 3 please note that for binary formats a bytes 705 string is returned, otherwise a (unicode) string is returned. 706 """ 707 if not format_spec: 708 # Follow python convention and default to using __str__ 709 return str(self) 710 from Bio import SeqIO 711 712 # Easy case, can call string-building function directly 713 if format_spec in SeqIO._FormatToString: 714 return SeqIO._FormatToString[format_spec](self) 715 716 # Harder case, make a temp handle instead 717 if format_spec in SeqIO._BinaryFormats: 718 # Return bytes on Python 3 719 from io import BytesIO 720 handle = BytesIO() 721 else: 722 from Bio._py3k import StringIO 723 handle = StringIO() 724 SeqIO.write(self, handle, format_spec) 725 return handle.getvalue()
726
727 - def __len__(self):
728 """Return the length of the sequence. 729 730 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 731 732 >>> from Bio import SeqIO 733 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 734 >>> len(record) 735 309 736 >>> len(record.seq) 737 309 738 """ 739 return len(self.seq)
740
741 - def __lt__(self, other):
742 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
743
744 - def __le___(self, other):
745 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
746
747 - def __eq__(self, other):
748 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
749
750 - def __ne__(self, other):
751 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
752
753 - def __gt__(self, other):
754 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
755
756 - def __ge__(self, other):
757 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
758 759 # Make SeqRecord unhashable explicit, required for Python 2. 760 # See github issue 929 for related discussion. 761 __hash__ = None 762 763 # Note Python 3 does not use __cmp__ and there is no need to 764 # define __cmp__ on Python 2 as have all of _lt__ etc defined. 765 766 # Python 3:
767 - def __bool__(self):
768 """Boolean value of an instance of this class (True). 769 770 This behaviour is for backwards compatibility, since until the 771 __len__ method was added, a SeqRecord always evaluated as True. 772 773 Note that in comparison, a Seq object will evaluate to False if it 774 has a zero length sequence. 775 776 WARNING: The SeqRecord may in future evaluate to False when its 777 sequence is of zero length (in order to better match the Seq 778 object behaviour)! 779 """ 780 return True
781 782 # Python 2: 783 __nonzero__ = __bool__ 784
785 - def __add__(self, other):
786 """Add another sequence or string to this sequence. 787 788 The other sequence can be a SeqRecord object, a Seq object (or 789 similar, e.g. a MutableSeq) or a plain Python string. If you add 790 a plain string or a Seq (like) object, the new SeqRecord will simply 791 have this appended to the existing data. However, any per letter 792 annotation will be lost: 793 794 >>> from Bio import SeqIO 795 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 796 >>> print("%s %s" % (record.id, record.seq)) 797 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 798 >>> print(list(record.letter_annotations)) 799 ['solexa_quality'] 800 801 >>> new = record + "ACT" 802 >>> print("%s %s" % (new.id, new.seq)) 803 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 804 >>> print(list(new.letter_annotations)) 805 [] 806 807 The new record will attempt to combine the annotation, but for any 808 ambiguities (e.g. different names) it defaults to omitting that 809 annotation. 810 811 >>> from Bio import SeqIO 812 >>> with open("GenBank/pBAD30.gb") as handle: 813 ... plasmid = SeqIO.read(handle, "gb") 814 >>> print("%s %i" % (plasmid.id, len(plasmid))) 815 pBAD30 4923 816 817 Now let's cut the plasmid into two pieces, and join them back up the 818 other way round (i.e. shift the starting point on this plasmid, have 819 a look at the annotated features in the original file to see why this 820 particular split point might make sense): 821 822 >>> left = plasmid[:3765] 823 >>> right = plasmid[3765:] 824 >>> new = right + left 825 >>> print("%s %i" % (new.id, len(new))) 826 pBAD30 4923 827 >>> str(new.seq) == str(right.seq + left.seq) 828 True 829 >>> len(new.features) == len(left.features) + len(right.features) 830 True 831 832 When we add the left and right SeqRecord objects, their annotation 833 is all consistent, so it is all conserved in the new SeqRecord: 834 835 >>> new.id == left.id == right.id == plasmid.id 836 True 837 >>> new.name == left.name == right.name == plasmid.name 838 True 839 >>> new.description == plasmid.description 840 True 841 >>> new.annotations == left.annotations == right.annotations 842 True 843 >>> new.letter_annotations == plasmid.letter_annotations 844 True 845 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 846 True 847 848 However, we should point out that when we sliced the SeqRecord, 849 any annotations dictionary or dbxrefs list entries were lost. 850 You can explicitly copy them like this: 851 852 >>> new.annotations = plasmid.annotations.copy() 853 >>> new.dbxrefs = plasmid.dbxrefs[:] 854 """ 855 if not isinstance(other, SeqRecord): 856 # Assume it is a string or a Seq. 857 # Note can't transfer any per-letter-annotations 858 return SeqRecord(self.seq + other, 859 id=self.id, name=self.name, 860 description=self.description, 861 features=self.features[:], 862 annotations=self.annotations.copy(), 863 dbxrefs=self.dbxrefs[:]) 864 # Adding two SeqRecord objects... must merge annotation. 865 answer = SeqRecord(self.seq + other.seq, 866 features=self.features[:], 867 dbxrefs=self.dbxrefs[:]) 868 # Will take all the features and all the db cross refs, 869 length = len(self) 870 for f in other.features: 871 answer.features.append(f._shift(length)) 872 del length 873 for ref in other.dbxrefs: 874 if ref not in answer.dbxrefs: 875 answer.dbxrefs.append(ref) 876 # Take common id/name/description/annotation 877 if self.id == other.id: 878 answer.id = self.id 879 if self.name == other.name: 880 answer.name = self.name 881 if self.description == other.description: 882 answer.description = self.description 883 for k, v in self.annotations.items(): 884 if k in other.annotations and other.annotations[k] == v: 885 answer.annotations[k] = v 886 # Can append matching per-letter-annotation 887 for k, v in self.letter_annotations.items(): 888 if k in other.letter_annotations: 889 answer.letter_annotations[k] = v + other.letter_annotations[k] 890 return answer
891
892 - def __radd__(self, other):
893 """Add another sequence or string to this sequence (from the left). 894 895 This method handles adding a Seq object (or similar, e.g. MutableSeq) 896 or a plain Python string (on the left) to a SeqRecord (on the right). 897 See the __add__ method for more details, but for example: 898 899 >>> from Bio import SeqIO 900 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 901 >>> print("%s %s" % (record.id, record.seq)) 902 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 903 >>> print(list(record.letter_annotations)) 904 ['solexa_quality'] 905 906 >>> new = "ACT" + record 907 >>> print("%s %s" % (new.id, new.seq)) 908 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 909 >>> print(list(new.letter_annotations)) 910 [] 911 """ 912 if isinstance(other, SeqRecord): 913 raise RuntimeError("This should have happened via the __add__ of " 914 "the other SeqRecord being added!") 915 # Assume it is a string or a Seq. 916 # Note can't transfer any per-letter-annotations 917 offset = len(other) 918 return SeqRecord(other + self.seq, 919 id=self.id, name=self.name, 920 description=self.description, 921 features=[f._shift(offset) for f in self.features], 922 annotations=self.annotations.copy(), 923 dbxrefs=self.dbxrefs[:])
924
925 - def upper(self):
926 """Return a copy of the record with an upper case sequence. 927 928 All the annotation is preserved unchanged. e.g. 929 930 >>> from Bio.Alphabet import generic_dna 931 >>> from Bio.Seq import Seq 932 >>> from Bio.SeqRecord import SeqRecord 933 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test", 934 ... description = "Made up for this example") 935 >>> record.letter_annotations["phred_quality"] = [1, 2, 3, 4, 5, 6, 7, 8] 936 >>> print(record.upper().format("fastq")) 937 @Test Made up for this example 938 ACGTACGT 939 + 940 "#$%&'() 941 <BLANKLINE> 942 943 Naturally, there is a matching lower method: 944 945 >>> print(record.lower().format("fastq")) 946 @Test Made up for this example 947 acgtacgt 948 + 949 "#$%&'() 950 <BLANKLINE> 951 """ 952 return SeqRecord(self.seq.upper(), 953 id=self.id, name=self.name, 954 description=self.description, 955 dbxrefs=self.dbxrefs[:], 956 features=self.features[:], 957 annotations=self.annotations.copy(), 958 letter_annotations=self.letter_annotations.copy())
959
960 - def lower(self):
961 """Return a copy of the record with a lower case sequence. 962 963 All the annotation is preserved unchanged. e.g. 964 965 >>> from Bio import SeqIO 966 >>> record = SeqIO.read("Fasta/aster.pro", "fasta") 967 >>> print(record.format("fasta")) 968 >gi|3298468|dbj|BAA31520.1| SAMIPF 969 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG 970 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI 971 <BLANKLINE> 972 >>> print(record.lower().format("fasta")) 973 >gi|3298468|dbj|BAA31520.1| SAMIPF 974 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg 975 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani 976 <BLANKLINE> 977 978 To take a more annotation rich example, 979 980 >>> from Bio import SeqIO 981 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl") 982 >>> len(old.features) 983 3 984 >>> new = old.lower() 985 >>> len(old.features) == len(new.features) 986 True 987 >>> old.annotations["organism"] == new.annotations["organism"] 988 True 989 >>> old.dbxrefs == new.dbxrefs 990 True 991 """ 992 return SeqRecord(self.seq.lower(), 993 id=self.id, name=self.name, 994 description=self.description, 995 dbxrefs=self.dbxrefs[:], 996 features=self.features[:], 997 annotations=self.annotations.copy(), 998 letter_annotations=self.letter_annotations.copy())
999
1000 - def reverse_complement(self, id=False, name=False, description=False, 1001 features=True, annotations=False, 1002 letter_annotations=True, dbxrefs=False):
1003 """Return new SeqRecord with reverse complement sequence. 1004 1005 By default the new record does NOT preserve the sequence identifier, 1006 name, description, general annotation or database cross-references - 1007 these are unlikely to apply to the reversed sequence. 1008 1009 You can specify the returned record's id, name and description as 1010 strings, or True to keep that of the parent, or False for a default. 1011 1012 You can specify the returned record's features with a list of 1013 SeqFeature objects, or True to keep that of the parent, or False to 1014 omit them. The default is to keep the original features (with the 1015 strand and locations adjusted). 1016 1017 You can also specify both the returned record's annotations and 1018 letter_annotations as dictionaries, True to keep that of the parent, 1019 or False to omit them. The default is to keep the original 1020 annotations (with the letter annotations reversed). 1021 1022 To show what happens to the pre-letter annotations, consider an 1023 example Solexa variant FASTQ file with a single entry, which we'll 1024 read in as a SeqRecord: 1025 1026 >>> from Bio import SeqIO 1027 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 1028 >>> print("%s %s" % (record.id, record.seq)) 1029 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 1030 >>> print(list(record.letter_annotations)) 1031 ['solexa_quality'] 1032 >>> print(record.letter_annotations["solexa_quality"]) 1033 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 1034 1035 Now take the reverse complement, here we explicitly give a new 1036 identifier (the old identifier with a suffix): 1037 1038 >>> rc_record = record.reverse_complement(id=record.id + "_rc") 1039 >>> print("%s %s" % (rc_record.id, rc_record.seq)) 1040 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT 1041 1042 Notice that the per-letter-annotations have also been reversed, 1043 although this may not be appropriate for all cases. 1044 1045 >>> print(rc_record.letter_annotations["solexa_quality"]) 1046 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] 1047 1048 Now for the features, we need a different example. Parsing a GenBank 1049 file is probably the easiest way to get an nice example with features 1050 in it... 1051 1052 >>> from Bio import SeqIO 1053 >>> with open("GenBank/pBAD30.gb") as handle: 1054 ... plasmid = SeqIO.read(handle, "gb") 1055 >>> print("%s %i" % (plasmid.id, len(plasmid))) 1056 pBAD30 4923 1057 >>> plasmid.seq 1058 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA()) 1059 >>> len(plasmid.features) 1060 13 1061 1062 Now, let's take the reverse complement of this whole plasmid: 1063 1064 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc") 1065 >>> print("%s %i" % (rc_plasmid.id, len(rc_plasmid))) 1066 pBAD30_rc 4923 1067 >>> rc_plasmid.seq 1068 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA()) 1069 >>> len(rc_plasmid.features) 1070 13 1071 1072 Let's compare the first CDS feature - it has gone from being the 1073 second feature (index 1) to the second last feature (index -2), its 1074 strand has changed, and the location switched round. 1075 1076 >>> print(plasmid.features[1]) 1077 type: CDS 1078 location: [1081:1960](-) 1079 qualifiers: 1080 Key: label, Value: ['araC'] 1081 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1082 Key: vntifkey, Value: ['4'] 1083 <BLANKLINE> 1084 >>> print(rc_plasmid.features[-2]) 1085 type: CDS 1086 location: [2963:3842](+) 1087 qualifiers: 1088 Key: label, Value: ['araC'] 1089 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1090 Key: vntifkey, Value: ['4'] 1091 <BLANKLINE> 1092 1093 You can check this new location, based on the length of the plasmid: 1094 1095 >>> len(plasmid) - 1081 1096 3842 1097 >>> len(plasmid) - 1960 1098 2963 1099 1100 Note that if the SeqFeature annotation includes any strand specific 1101 information (e.g. base changes for a SNP), this information is not 1102 amended, and would need correction after the reverse complement. 1103 1104 Note trying to reverse complement a protein SeqRecord raises an 1105 exception: 1106 1107 >>> from Bio.SeqRecord import SeqRecord 1108 >>> from Bio.Seq import Seq 1109 >>> from Bio.Alphabet import IUPAC 1110 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test") 1111 >>> protein_rec.reverse_complement() 1112 Traceback (most recent call last): 1113 ... 1114 ValueError: Proteins do not have complements! 1115 1116 Also note you can reverse complement a SeqRecord using a MutableSeq: 1117 1118 >>> from Bio.SeqRecord import SeqRecord 1119 >>> from Bio.Seq import MutableSeq 1120 >>> from Bio.Alphabet import generic_dna 1121 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test") 1122 >>> rec.seq[0] = "T" 1123 >>> print("%s %s" % (rec.id, rec.seq)) 1124 Test TCGT 1125 >>> rc = rec.reverse_complement(id=True) 1126 >>> print("%s %s" % (rc.id, rc.seq)) 1127 Test ACGA 1128 """ 1129 from Bio.Seq import MutableSeq # Lazy to avoid circular imports 1130 if isinstance(self.seq, MutableSeq): 1131 # Currently the MutableSeq reverse complement is in situ 1132 answer = SeqRecord(self.seq.toseq().reverse_complement()) 1133 else: 1134 answer = SeqRecord(self.seq.reverse_complement()) 1135 if isinstance(id, basestring): 1136 answer.id = id 1137 elif id: 1138 answer.id = self.id 1139 if isinstance(name, basestring): 1140 answer.name = name 1141 elif name: 1142 answer.name = self.name 1143 if isinstance(description, basestring): 1144 answer.description = description 1145 elif description: 1146 answer.description = self.description 1147 if isinstance(dbxrefs, list): 1148 answer.dbxrefs = dbxrefs 1149 elif dbxrefs: 1150 # Copy the old dbxrefs 1151 answer.dbxrefs = self.dbxrefs[:] 1152 if isinstance(features, list): 1153 answer.features = features 1154 elif features: 1155 # Copy the old features, adjusting location and string 1156 length = len(answer) 1157 answer.features = [f._flip(length) for f in self.features] 1158 # The old list should have been sorted by start location, 1159 # reversing it will leave it sorted by what is now the end position, 1160 # so we need to resort in case of overlapping features. 1161 # NOTE - In the common case of gene before CDS (and similar) with 1162 # the exact same locations, this will still maintain gene before CDS 1163 answer.features.sort(key=lambda x: x.location.start.position) 1164 if isinstance(annotations, dict): 1165 answer.annotations = annotations 1166 elif annotations: 1167 # Copy the old annotations, 1168 answer.annotations = self.annotations.copy() 1169 if isinstance(letter_annotations, dict): 1170 answer.letter_annotations = letter_annotations 1171 elif letter_annotations: 1172 # Copy the old per letter annotations, reversing them 1173 for key, value in self.letter_annotations.items(): 1174 answer._per_letter_annotations[key] = value[::-1] 1175 return answer
1176
1177 - def translate(self, 1178 # Seq translation arguments: 1179 table="Standard", stop_symbol="*", to_stop=False, 1180 cds=False, gap=None, 1181 # SeqRecord annotation arguments: 1182 id=False, name=False, description=False, 1183 features=False, annotations=False, 1184 letter_annotations=False, dbxrefs=False):
1185 """Return new SeqRecord with translated sequence. 1186 1187 This calls the record's .seq.translate() method (which describes 1188 the translation related arguments, like table for the genetic code), 1189 1190 By default the new record does NOT preserve the sequence identifier, 1191 name, description, general annotation or database cross-references - 1192 these are unlikely to apply to the translated sequence. 1193 1194 You can specify the returned record's id, name and description as 1195 strings, or True to keep that of the parent, or False for a default. 1196 1197 You can specify the returned record's features with a list of 1198 SeqFeature objects, or False (default) to omit them. 1199 1200 You can also specify both the returned record's annotations and 1201 letter_annotations as dictionaries, True to keep that of the parent 1202 (annotations only), or False (default) to omit them. 1203 1204 e.g. Loading a FASTA gene and translating it, 1205 1206 >>> from Bio import SeqIO 1207 >>> gene_record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 1208 >>> print(gene_record.format("fasta")) 1209 >gi|3176602|gb|U78617.1|LOU78617 Lathyrus odoratus phytochrome A (PHYA) gene, partial cds 1210 CAGGCTGCGCGGTTTCTATTTATGAAGAACAAGGTCCGTATGATAGTTGATTGTCATGCA 1211 AAACATGTGAAGGTTCTTCAAGACGAAAAACTCCCATTTGATTTGACTCTGTGCGGTTCG 1212 ACCTTAAGAGCTCCACATAGTTGCCATTTGCAGTACATGGCTAACATGGATTCAATTGCT 1213 TCATTGGTTATGGCAGTGGTCGTCAATGACAGCGATGAAGATGGAGATAGCCGTGACGCA 1214 GTTCTACCACAAAAGAAAAAGAGACTTTGGGGTTTGGTAGTTTGTCATAACACTACTCCG 1215 AGGTTTGTT 1216 <BLANKLINE> 1217 1218 And now translating the record, specifying the new ID and description: 1219 1220 >>> protein_record = gene_record.translate(table=11, 1221 ... id="phya", 1222 ... description="translation") 1223 >>> print(protein_record.format("fasta")) 1224 >phya translation 1225 QAARFLFMKNKVRMIVDCHAKHVKVLQDEKLPFDLTLCGSTLRAPHSCHLQYMANMDSIA 1226 SLVMAVVVNDSDEDGDSRDAVLPQKKKRLWGLVVCHNTTPRFV 1227 <BLANKLINE> 1228 1229 """ 1230 answer = SeqRecord(self.seq.translate(table=table, 1231 stop_symbol=stop_symbol, 1232 to_stop=to_stop, 1233 cds=cds, 1234 gap=gap)) 1235 if isinstance(id, basestring): 1236 answer.id = id 1237 elif id: 1238 answer.id = self.id 1239 if isinstance(name, basestring): 1240 answer.name = name 1241 elif name: 1242 answer.name = self.name 1243 if isinstance(description, basestring): 1244 answer.description = description 1245 elif description: 1246 answer.description = self.description 1247 if isinstance(dbxrefs, list): 1248 answer.dbxrefs = dbxrefs 1249 elif dbxrefs: 1250 # Copy the old dbxrefs 1251 answer.dbxrefs = self.dbxrefs[:] 1252 if isinstance(features, list): 1253 answer.features = features 1254 elif features: 1255 # Does not make sense to copy old features as locations wrong 1256 raise TypeError("Unexpected features argument %r" % features) 1257 if isinstance(annotations, dict): 1258 answer.annotations = annotations 1259 elif annotations: 1260 # Copy the old annotations 1261 answer.annotations = self.annotations.copy() 1262 if isinstance(letter_annotations, dict): 1263 answer.letter_annotations = letter_annotations 1264 elif letter_annotations: 1265 # Does not make sense to copy these as length now wrong 1266 raise TypeError("Unexpected letter_annotations argument %r" % letter_annotations) 1267 return answer
1268 1269 1270 if __name__ == "__main__": 1271 from Bio._utils import run_doctest 1272 run_doctest() 1273