Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

   1  # Copyright 2000-2002 Andrew Dalke.  All rights reserved. 
   2  # Copyright 2002-2004 Brad Chapman.  All rights reserved. 
   3  # Copyright 2006-2017 by Peter Cock.  All rights reserved. 
   4  # 
   5  # This file is part of the Biopython distribution and governed by your 
   6  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
   7  # Please see the LICENSE file that should have been included as part of this 
   8  # package. 
   9  """Represent a Sequence Record, a sequence with annotation.""" 
  10   
  11   
  12  from Bio._py3k import basestring 
  13   
  14  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
  15  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
  16  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
  17  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
  18   
  19  _NO_SEQRECORD_COMPARISON = "SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest." 
  20   
  21   
22 -class _RestrictedDict(dict):
23 """Dict which only allows sequences of given length as values (PRIVATE). 24 25 This simple subclass of the Python dictionary is used in the SeqRecord 26 object for holding per-letter-annotations. This class is intended to 27 prevent simple errors by only allowing python sequences (e.g. lists, 28 strings and tuples) to be stored, and only if their length matches that 29 expected (the length of the SeqRecord's seq object). It cannot however 30 prevent the entries being edited in situ (for example appending entries 31 to a list). 32 33 >>> x = _RestrictedDict(5) 34 >>> x["test"] = "hello" 35 >>> x 36 {'test': 'hello'} 37 38 Adding entries which don't have the expected length are blocked: 39 40 >>> x["test"] = "hello world" 41 Traceback (most recent call last): 42 ... 43 TypeError: We only allow python sequences (lists, tuples or strings) of length 5. 44 45 The expected length is stored as a private attribute, 46 47 >>> x._length 48 5 49 50 In order that the SeqRecord (and other objects using this class) can be 51 pickled, for example for use in the multiprocessing library, we need to 52 be able to pickle the restricted dictionary objects. 53 54 Using the default protocol, which is 0 on Python 2.x, 55 56 >>> import pickle 57 >>> y = pickle.loads(pickle.dumps(x)) 58 >>> y 59 {'test': 'hello'} 60 >>> y._length 61 5 62 63 Using the highest protocol, which is 2 on Python 2.x, 64 65 >>> import pickle 66 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL)) 67 >>> z 68 {'test': 'hello'} 69 >>> z._length 70 5 71 """ 72
73 - def __init__(self, length):
74 """Create an EMPTY restricted dictionary.""" 75 dict.__init__(self) 76 self._length = int(length)
77
78 - def __setitem__(self, key, value):
79 # The check hasattr(self, "_length") is to cope with pickle protocol 2 80 # I couldn't seem to avoid this with __getstate__ and __setstate__ 81 if not hasattr(value, "__len__") or not hasattr(value, "__getitem__") \ 82 or (hasattr(self, "_length") and len(value) != self._length): 83 raise TypeError("We only allow python sequences (lists, tuples or " 84 "strings) of length {0}.".format(self._length)) 85 dict.__setitem__(self, key, value)
86
87 - def update(self, new_dict):
88 # Force this to go via our strict __setitem__ method 89 for (key, value) in new_dict.items(): 90 self[key] = value
91 92
93 -class SeqRecord(object):
94 """A SeqRecord object holds a sequence and information about it. 95 96 Main attributes: 97 - id - Identifier such as a locus tag (string) 98 - seq - The sequence itself (Seq object or similar) 99 100 Additional attributes: 101 - name - Sequence name, e.g. gene name (string) 102 - description - Additional text (string) 103 - dbxrefs - List of database cross references (list of strings) 104 - features - Any (sub)features defined (list of SeqFeature objects) 105 - annotations - Further information about the whole sequence (dictionary). 106 Most entries are strings, or lists of strings. 107 - letter_annotations - Per letter/symbol annotation (restricted 108 dictionary). This holds Python sequences (lists, strings 109 or tuples) whose length matches that of the sequence. 110 A typical use would be to hold a list of integers 111 representing sequencing quality scores, or a string 112 representing the secondary structure. 113 114 You will typically use Bio.SeqIO to read in sequences from files as 115 SeqRecord objects. However, you may want to create your own SeqRecord 116 objects directly (see the __init__ method for further details): 117 118 >>> from Bio.Seq import Seq 119 >>> from Bio.SeqRecord import SeqRecord 120 >>> from Bio.Alphabet import IUPAC 121 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 122 ... IUPAC.protein), 123 ... id="YP_025292.1", name="HokC", 124 ... description="toxic membrane protein") 125 >>> print(record) 126 ID: YP_025292.1 127 Name: HokC 128 Description: toxic membrane protein 129 Number of features: 0 130 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 131 132 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 133 for this. For the special case where you want the SeqRecord turned into 134 a string in a particular file format there is a format method which uses 135 Bio.SeqIO internally: 136 137 >>> print(record.format("fasta")) 138 >YP_025292.1 toxic membrane protein 139 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 140 <BLANKLINE> 141 142 You can also do things like slicing a SeqRecord, checking its length, etc 143 144 >>> len(record) 145 44 146 >>> edited = record[:10] + record[11:] 147 >>> print(edited.seq) 148 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 149 >>> print(record.seq) 150 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 151 152 """ 153
154 - def __init__(self, seq, id="<unknown id>", name="<unknown name>", 155 description="<unknown description>", dbxrefs=None, 156 features=None, annotations=None, 157 letter_annotations=None):
158 """Create a SeqRecord. 159 160 Arguments: 161 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 162 - id - Sequence identifier, recommended (string) 163 - name - Sequence name, optional (string) 164 - description - Sequence description, optional (string) 165 - dbxrefs - Database cross references, optional (list of strings) 166 - features - Any (sub)features, optional (list of SeqFeature objects) 167 - annotations - Dictionary of annotations for the whole sequence 168 - letter_annotations - Dictionary of per-letter-annotations, values 169 should be strings, list or tuples of the same length as the full 170 sequence. 171 172 You will typically use Bio.SeqIO to read in sequences from files as 173 SeqRecord objects. However, you may want to create your own SeqRecord 174 objects directly. 175 176 Note that while an id is optional, we strongly recommend you supply a 177 unique id string for each record. This is especially important 178 if you wish to write your sequences to a file. 179 180 If you don't have the actual sequence, but you do know its length, 181 then using the UnknownSeq object from Bio.Seq is appropriate. 182 183 You can create a 'blank' SeqRecord object, and then populate the 184 attributes later. 185 """ 186 if id is not None and not isinstance(id, basestring): 187 # Lots of existing code uses id=None... this may be a bad idea. 188 raise TypeError("id argument should be a string") 189 if not isinstance(name, basestring): 190 raise TypeError("name argument should be a string") 191 if not isinstance(description, basestring): 192 raise TypeError("description argument should be a string") 193 self._seq = seq 194 self.id = id 195 self.name = name 196 self.description = description 197 198 # database cross references (for the whole sequence) 199 if dbxrefs is None: 200 dbxrefs = [] 201 elif not isinstance(dbxrefs, list): 202 raise TypeError("dbxrefs argument should be a list (of strings)") 203 self.dbxrefs = dbxrefs 204 205 # annotations about the whole sequence 206 if annotations is None: 207 annotations = {} 208 elif not isinstance(annotations, dict): 209 raise TypeError("annotations argument should be a dict") 210 self.annotations = annotations 211 212 if letter_annotations is None: 213 # annotations about each letter in the sequence 214 if seq is None: 215 # Should we allow this and use a normal unrestricted dict? 216 self._per_letter_annotations = _RestrictedDict(length=0) 217 else: 218 try: 219 self._per_letter_annotations = \ 220 _RestrictedDict(length=len(seq)) 221 except TypeError: 222 raise TypeError("seq argument should be a Seq object or similar") 223 else: 224 # This will be handled via the property set function, which will 225 # turn this into a _RestrictedDict and thus ensure all the values 226 # in the dict are the right length 227 self.letter_annotations = letter_annotations 228 229 # annotations about parts of the sequence 230 if features is None: 231 features = [] 232 elif not isinstance(features, list): 233 raise TypeError("features argument should be a list (of SeqFeature objects)") 234 self.features = features
235 236 # TODO - Just make this a read only property?
237 - def _set_per_letter_annotations(self, value):
238 if not isinstance(value, dict): 239 raise TypeError("The per-letter-annotations should be a " 240 "(restricted) dictionary.") 241 # Turn this into a restricted-dictionary (and check the entries) 242 try: 243 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 244 except AttributeError: 245 # e.g. seq is None 246 self._per_letter_annotations = _RestrictedDict(length=0) 247 self._per_letter_annotations.update(value)
248 letter_annotations = property( 249 fget=lambda self: self._per_letter_annotations, 250 fset=_set_per_letter_annotations, 251 doc="""Dictionary of per-letter-annotation for the sequence. 252 253 For example, this can hold quality scores used in FASTQ or QUAL files. 254 Consider this example using Bio.SeqIO to read in an example Solexa 255 variant FASTQ file as a SeqRecord: 256 257 >>> from Bio import SeqIO 258 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 259 >>> print("%s %s" % (record.id, record.seq)) 260 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 261 >>> print(list(record.letter_annotations)) 262 ['solexa_quality'] 263 >>> print(record.letter_annotations["solexa_quality"]) 264 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 265 266 The letter_annotations get sliced automatically if you slice the 267 parent SeqRecord, for example taking the last ten bases: 268 269 >>> sub_record = record[-10:] 270 >>> print("%s %s" % (sub_record.id, sub_record.seq)) 271 slxa_0001_1_0001_01 ACGTNNNNNN 272 >>> print(sub_record.letter_annotations["solexa_quality"]) 273 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 274 275 Any python sequence (i.e. list, tuple or string) can be recorded in 276 the SeqRecord's letter_annotations dictionary as long as the length 277 matches that of the SeqRecord's sequence. e.g. 278 279 >>> len(sub_record.letter_annotations) 280 1 281 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 282 >>> len(sub_record.letter_annotations) 283 2 284 285 You can delete entries from the letter_annotations dictionary as usual: 286 287 >>> del sub_record.letter_annotations["solexa_quality"] 288 >>> sub_record.letter_annotations 289 {'dummy': 'abcdefghij'} 290 291 You can completely clear the dictionary easily as follows: 292 293 >>> sub_record.letter_annotations = {} 294 >>> sub_record.letter_annotations 295 {} 296 297 Note that if replacing the record's sequence with a sequence of a 298 different length you must first clear the letter_annotations dict. 299 """) 300
301 - def _set_seq(self, value):
302 # TODO - Add a deprecation warning that the seq should be write only? 303 if self._per_letter_annotations: 304 if len(self) != len(value): 305 # TODO - Make this a warning? Silently empty the dictionary? 306 raise ValueError("You must empty the letter annotations first!") 307 else: 308 # Leave the existing per letter annotations unchanged: 309 self._seq = value 310 else: 311 self._seq = value 312 # Reset the (empty) letter annotations dict with new length: 313 try: 314 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 315 except AttributeError: 316 # e.g. seq is None 317 self._per_letter_annotations = _RestrictedDict(length=0)
318 319 seq = property(fget=lambda self: self._seq, 320 fset=_set_seq, 321 doc="The sequence itself, as a Seq or MutableSeq object.") 322
323 - def __getitem__(self, index):
324 """Returns a sub-sequence or an individual letter. 325 326 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 327 that sub-sequence with appropriate annotation preserved. The 328 name, id and description are kept. 329 330 Any per-letter-annotations are sliced to match the requested 331 sub-sequence. Unless a stride is used, all those features 332 which fall fully within the subsequence are included (with 333 their locations adjusted accordingly). 334 335 However, the annotations dictionary and the dbxrefs list are 336 not used for the new SeqRecord, as in general they may not 337 apply to the subsequence. If you want to preserve them, you 338 must explicitly copy them to the new SeqRecord yourself. 339 340 Using an integer index, e.g. my_record[5] is shorthand for 341 extracting that letter from the sequence, my_record.seq[5]. 342 343 For example, consider this short protein and its secondary 344 structure as encoded by the PDB (e.g. H for alpha helices), 345 plus a simple feature for its histidine self phosphorylation 346 site: 347 348 >>> from Bio.Seq import Seq 349 >>> from Bio.SeqRecord import SeqRecord 350 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 351 >>> from Bio.Alphabet import IUPAC 352 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 353 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 354 ... IUPAC.protein), 355 ... id="1JOY", name="EnvZ", 356 ... description="Homodimeric domain of EnvZ from E. coli") 357 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 358 >>> rec.features.append(SeqFeature(FeatureLocation(20, 21), 359 ... type = "Site")) 360 361 Now let's have a quick look at the full record, 362 363 >>> print(rec) 364 ID: 1JOY 365 Name: EnvZ 366 Description: Homodimeric domain of EnvZ from E. coli 367 Number of features: 1 368 Per letter annotation for: secondary_structure 369 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 370 >>> rec.letter_annotations["secondary_structure"] 371 ' S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT ' 372 >>> print(rec.features[0].location) 373 [20:21] 374 375 Now let's take a sub sequence, here chosen as the first (fractured) 376 alpha helix which includes the histidine phosphorylation site: 377 378 >>> sub = rec[11:41] 379 >>> print(sub) 380 ID: 1JOY 381 Name: EnvZ 382 Description: Homodimeric domain of EnvZ from E. coli 383 Number of features: 1 384 Per letter annotation for: secondary_structure 385 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 386 >>> sub.letter_annotations["secondary_structure"] 387 'HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH' 388 >>> print(sub.features[0].location) 389 [9:10] 390 391 You can also of course omit the start or end values, for 392 example to get the first ten letters only: 393 394 >>> print(rec[:10]) 395 ID: 1JOY 396 Name: EnvZ 397 Description: Homodimeric domain of EnvZ from E. coli 398 Number of features: 0 399 Per letter annotation for: secondary_structure 400 Seq('MAAGVKQLAD', IUPACProtein()) 401 402 Or for the last ten letters: 403 404 >>> print(rec[-10:]) 405 ID: 1JOY 406 Name: EnvZ 407 Description: Homodimeric domain of EnvZ from E. coli 408 Number of features: 0 409 Per letter annotation for: secondary_structure 410 Seq('IIEQFIDYLR', IUPACProtein()) 411 412 If you omit both, then you get a copy of the original record (although 413 lacking the annotations and dbxrefs): 414 415 >>> print(rec[:]) 416 ID: 1JOY 417 Name: EnvZ 418 Description: Homodimeric domain of EnvZ from E. coli 419 Number of features: 1 420 Per letter annotation for: secondary_structure 421 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 422 423 Finally, indexing with a simple integer is shorthand for pulling out 424 that letter from the sequence directly: 425 426 >>> rec[5] 427 'K' 428 >>> rec.seq[5] 429 'K' 430 """ 431 if isinstance(index, int): 432 # NOTE - The sequence level annotation like the id, name, etc 433 # do not really apply to a single character. However, should 434 # we try and expose any per-letter-annotation here? If so how? 435 return self.seq[index] 436 elif isinstance(index, slice): 437 if self.seq is None: 438 raise ValueError("If the sequence is None, we cannot slice it.") 439 parent_length = len(self) 440 from BioSQL.BioSeq import DBSeqRecord 441 if isinstance(self, DBSeqRecord): 442 answer = SeqRecord(self.seq[index], 443 id=self.id, 444 name=self.name, 445 description=self.description) 446 else: 447 answer = self.__class__(self.seq[index], 448 id=self.id, 449 name=self.name, 450 description=self.description) 451 # TODO - The description may no longer apply. 452 # It would be safer to change it to something 453 # generic like "edited" or the default value. 454 455 # Don't copy the annotation dict and dbxefs list, 456 # they may not apply to a subsequence. 457 # answer.annotations = dict(self.annotations.items()) 458 # answer.dbxrefs = self.dbxrefs[:] 459 # TODO - Review this in light of adding SeqRecord objects? 460 461 # TODO - Cope with strides by generating ambiguous locations? 462 start, stop, step = index.indices(parent_length) 463 if step == 1: 464 # Select relevant features, add them with shifted locations 465 # assert str(self.seq)[index] == str(self.seq)[start:stop] 466 for f in self.features: 467 if f.ref or f.ref_db: 468 # TODO - Implement this (with lots of tests)? 469 import warnings 470 warnings.warn("When slicing SeqRecord objects, any " 471 "SeqFeature referencing other sequences (e.g. " 472 "from segmented GenBank records) are ignored.") 473 continue 474 if start <= f.location.nofuzzy_start \ 475 and f.location.nofuzzy_end <= stop: 476 answer.features.append(f._shift(-start)) 477 478 # Slice all the values to match the sliced sequence 479 # (this should also work with strides, even negative strides): 480 for key, value in self.letter_annotations.items(): 481 answer._per_letter_annotations[key] = value[index] 482 483 return answer 484 raise ValueError("Invalid index")
485
486 - def __iter__(self):
487 """Iterate over the letters in the sequence. 488 489 For example, using Bio.SeqIO to read in a protein FASTA file: 490 491 >>> from Bio import SeqIO 492 >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta") 493 >>> for amino in record: 494 ... print(amino) 495 ... if amino == "L": break 496 X 497 A 498 G 499 L 500 >>> print(record.seq[3]) 501 L 502 503 This is just a shortcut for iterating over the sequence directly: 504 505 >>> for amino in record.seq: 506 ... print(amino) 507 ... if amino == "L": break 508 X 509 A 510 G 511 L 512 >>> print(record.seq[3]) 513 L 514 515 Note that this does not facilitate iteration together with any 516 per-letter-annotation. However, you can achieve that using the 517 python zip function on the record (or its sequence) and the relevant 518 per-letter-annotation: 519 520 >>> from Bio import SeqIO 521 >>> rec = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 522 >>> print("%s %s" % (rec.id, rec.seq)) 523 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 524 >>> print(list(rec.letter_annotations)) 525 ['solexa_quality'] 526 >>> for nuc, qual in zip(rec, rec.letter_annotations["solexa_quality"]): 527 ... if qual > 35: 528 ... print("%s %i" % (nuc, qual)) 529 A 40 530 C 39 531 G 38 532 T 37 533 A 36 534 535 You may agree that using zip(rec.seq, ...) is more explicit than using 536 zip(rec, ...) as shown above. 537 """ 538 return iter(self.seq)
539
540 - def __contains__(self, char):
541 """Implements the 'in' keyword, searches the sequence. 542 543 e.g. 544 545 >>> from Bio import SeqIO 546 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 547 >>> "GAATTC" in record 548 False 549 >>> "AAA" in record 550 True 551 552 This essentially acts as a proxy for using "in" on the sequence: 553 554 >>> "GAATTC" in record.seq 555 False 556 >>> "AAA" in record.seq 557 True 558 559 Note that you can also use Seq objects as the query, 560 561 >>> from Bio.Seq import Seq 562 >>> from Bio.Alphabet import generic_dna 563 >>> Seq("AAA") in record 564 True 565 >>> Seq("AAA", generic_dna) in record 566 True 567 568 See also the Seq object's __contains__ method. 569 """ 570 return char in self.seq
571
572 - def __str__(self):
573 """A human readable summary of the record and its annotation (string). 574 575 The python built in function str works by calling the object's ___str__ 576 method. e.g. 577 578 >>> from Bio.Seq import Seq 579 >>> from Bio.SeqRecord import SeqRecord 580 >>> from Bio.Alphabet import IUPAC 581 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 582 ... IUPAC.protein), 583 ... id="YP_025292.1", name="HokC", 584 ... description="toxic membrane protein, small") 585 >>> print(str(record)) 586 ID: YP_025292.1 587 Name: HokC 588 Description: toxic membrane protein, small 589 Number of features: 0 590 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 591 592 In this example you don't actually need to call str explicity, as the 593 print command does this automatically: 594 595 >>> print(record) 596 ID: YP_025292.1 597 Name: HokC 598 Description: toxic membrane protein, small 599 Number of features: 0 600 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 601 602 Note that long sequences are shown truncated. 603 """ 604 lines = [] 605 if self.id: 606 lines.append("ID: {0}".format(self.id)) 607 if self.name: 608 lines.append("Name: {0}".format(self.name)) 609 if self.description: 610 lines.append("Description: {0}".format(self.description)) 611 if self.dbxrefs: 612 lines.append("Database cross-references: " + ", ".join(self.dbxrefs)) 613 lines.append("Number of features: {0}".format(len(self.features))) 614 for a in self.annotations: 615 lines.append("/{0}={1}".format(a, str(self.annotations[a]))) 616 if self.letter_annotations: 617 lines.append("Per letter annotation for: " + ", ".join(self.letter_annotations)) 618 # Don't want to include the entire sequence, 619 # and showing the alphabet is useful: 620 lines.append(repr(self.seq)) 621 return "\n".join(lines)
622
623 - def __repr__(self):
624 """A concise summary of the record for debugging (string). 625 626 The python built in function repr works by calling the object's ___repr__ 627 method. e.g. 628 629 >>> from Bio.Seq import Seq 630 >>> from Bio.SeqRecord import SeqRecord 631 >>> from Bio.Alphabet import generic_protein 632 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 633 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 634 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 635 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 636 ... generic_protein), 637 ... id="NP_418483.1", name="b4059", 638 ... description="ssDNA-binding protein", 639 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 640 >>> print(repr(rec)) 641 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 642 643 At the python prompt you can also use this shorthand: 644 645 >>> rec 646 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 647 648 Note that long sequences are shown truncated. Also note that any 649 annotations, letter_annotations and features are not shown (as they 650 would lead to a very long string). 651 """ 652 return "{0}(seq={1!r}, id={2!r}, name={3!r}, description={4!r}, dbxrefs={5!r})".format( 653 self.__class__.__name__, 654 self.seq, self.id, self.name, 655 self.description, self.dbxrefs)
656
657 - def format(self, format):
658 r"""Returns the record as a string in the specified file format. 659 660 The format should be a lower case string supported as an output 661 format by Bio.SeqIO, which is used to turn the SeqRecord into a 662 string. e.g. 663 664 >>> from Bio.Seq import Seq 665 >>> from Bio.SeqRecord import SeqRecord 666 >>> from Bio.Alphabet import IUPAC 667 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 668 ... IUPAC.protein), 669 ... id="YP_025292.1", name="HokC", 670 ... description="toxic membrane protein") 671 >>> record.format("fasta") 672 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 673 >>> print(record.format("fasta")) 674 >YP_025292.1 toxic membrane protein 675 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 676 <BLANKLINE> 677 678 The python print command automatically appends a new line, meaning 679 in this example a blank line is shown. If you look at the string 680 representation you can see there is a trailing new line (shown as 681 slash n) which is important when writing to a file or if 682 concatenating multiple sequence strings together. 683 684 Note that this method will NOT work on every possible file format 685 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 686 """ 687 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 688 # See also the Bio.Align.Generic.Alignment class and its format() 689 return self.__format__(format)
690
691 - def __format__(self, format_spec):
692 """Returns the record as a string in the specified file format. 693 694 This method supports the python format() function added in 695 Python 2.6/3.0. The format_spec should be a lower case string 696 supported by Bio.SeqIO as an output file format. See also the 697 SeqRecord's format() method. 698 699 Under Python 3 please note that for binary formats a bytes 700 string is returned, otherwise a (unicode) string is returned. 701 """ 702 if not format_spec: 703 # Follow python convention and default to using __str__ 704 return str(self) 705 from Bio import SeqIO 706 if format_spec in SeqIO._BinaryFormats: 707 # Return bytes on Python 3 708 from io import BytesIO 709 handle = BytesIO() 710 else: 711 from Bio._py3k import StringIO 712 handle = StringIO() 713 SeqIO.write(self, handle, format_spec) 714 return handle.getvalue()
715
716 - def __len__(self):
717 """Returns the length of the sequence. 718 719 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 720 721 >>> from Bio import SeqIO 722 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 723 >>> len(record) 724 309 725 >>> len(record.seq) 726 309 727 """ 728 return len(self.seq)
729
730 - def __lt__(self, other):
731 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
732
733 - def __le___(self, other):
734 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
735
736 - def __eq__(self, other):
737 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
738
739 - def __ne__(self, other):
740 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
741
742 - def __gt__(self, other):
743 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
744
745 - def __ge__(self, other):
746 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
747 748 # Note Python 3 does not use __cmp__ and there is no need to 749 # define __cmp__ on Python 2 as have all of _lt__ etc defined. 750 751 # Python 3:
752 - def __bool__(self):
753 """Boolean value of an instance of this class (True). 754 755 This behaviour is for backwards compatibility, since until the 756 __len__ method was added, a SeqRecord always evaluated as True. 757 758 Note that in comparison, a Seq object will evaluate to False if it 759 has a zero length sequence. 760 761 WARNING: The SeqRecord may in future evaluate to False when its 762 sequence is of zero length (in order to better match the Seq 763 object behaviour)! 764 """ 765 return True
766 767 # Python 2: 768 __nonzero__ = __bool__ 769
770 - def __add__(self, other):
771 """Add another sequence or string to this sequence. 772 773 The other sequence can be a SeqRecord object, a Seq object (or 774 similar, e.g. a MutableSeq) or a plain Python string. If you add 775 a plain string or a Seq (like) object, the new SeqRecord will simply 776 have this appended to the existing data. However, any per letter 777 annotation will be lost: 778 779 >>> from Bio import SeqIO 780 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 781 >>> print("%s %s" % (record.id, record.seq)) 782 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 783 >>> print(list(record.letter_annotations)) 784 ['solexa_quality'] 785 786 >>> new = record + "ACT" 787 >>> print("%s %s" % (new.id, new.seq)) 788 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 789 >>> print(list(new.letter_annotations)) 790 [] 791 792 The new record will attempt to combine the annotation, but for any 793 ambiguities (e.g. different names) it defaults to omitting that 794 annotation. 795 796 >>> from Bio import SeqIO 797 >>> with open("GenBank/pBAD30.gb") as handle: 798 ... plasmid = SeqIO.read(handle, "gb") 799 >>> print("%s %i" % (plasmid.id, len(plasmid))) 800 pBAD30 4923 801 802 Now let's cut the plasmid into two pieces, and join them back up the 803 other way round (i.e. shift the starting point on this plasmid, have 804 a look at the annotated features in the original file to see why this 805 particular split point might make sense): 806 807 >>> left = plasmid[:3765] 808 >>> right = plasmid[3765:] 809 >>> new = right + left 810 >>> print("%s %i" % (new.id, len(new))) 811 pBAD30 4923 812 >>> str(new.seq) == str(right.seq + left.seq) 813 True 814 >>> len(new.features) == len(left.features) + len(right.features) 815 True 816 817 When we add the left and right SeqRecord objects, their annotation 818 is all consistent, so it is all conserved in the new SeqRecord: 819 820 >>> new.id == left.id == right.id == plasmid.id 821 True 822 >>> new.name == left.name == right.name == plasmid.name 823 True 824 >>> new.description == plasmid.description 825 True 826 >>> new.annotations == left.annotations == right.annotations 827 True 828 >>> new.letter_annotations == plasmid.letter_annotations 829 True 830 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 831 True 832 833 However, we should point out that when we sliced the SeqRecord, 834 any annotations dictionary or dbxrefs list entries were lost. 835 You can explicitly copy them like this: 836 837 >>> new.annotations = plasmid.annotations.copy() 838 >>> new.dbxrefs = plasmid.dbxrefs[:] 839 """ 840 if not isinstance(other, SeqRecord): 841 # Assume it is a string or a Seq. 842 # Note can't transfer any per-letter-annotations 843 return SeqRecord(self.seq + other, 844 id=self.id, name=self.name, 845 description=self.description, 846 features=self.features[:], 847 annotations=self.annotations.copy(), 848 dbxrefs=self.dbxrefs[:]) 849 # Adding two SeqRecord objects... must merge annotation. 850 answer = SeqRecord(self.seq + other.seq, 851 features=self.features[:], 852 dbxrefs=self.dbxrefs[:]) 853 # Will take all the features and all the db cross refs, 854 l = len(self) 855 for f in other.features: 856 answer.features.append(f._shift(l)) 857 del l 858 for ref in other.dbxrefs: 859 if ref not in answer.dbxrefs: 860 answer.dbxrefs.append(ref) 861 # Take common id/name/description/annotation 862 if self.id == other.id: 863 answer.id = self.id 864 if self.name == other.name: 865 answer.name = self.name 866 if self.description == other.description: 867 answer.description = self.description 868 for k, v in self.annotations.items(): 869 if k in other.annotations and other.annotations[k] == v: 870 answer.annotations[k] = v 871 # Can append matching per-letter-annotation 872 for k, v in self.letter_annotations.items(): 873 if k in other.letter_annotations: 874 answer.letter_annotations[k] = v + other.letter_annotations[k] 875 return answer
876
877 - def __radd__(self, other):
878 """Add another sequence or string to this sequence (from the left). 879 880 This method handles adding a Seq object (or similar, e.g. MutableSeq) 881 or a plain Python string (on the left) to a SeqRecord (on the right). 882 See the __add__ method for more details, but for example: 883 884 >>> from Bio import SeqIO 885 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 886 >>> print("%s %s" % (record.id, record.seq)) 887 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 888 >>> print(list(record.letter_annotations)) 889 ['solexa_quality'] 890 891 >>> new = "ACT" + record 892 >>> print("%s %s" % (new.id, new.seq)) 893 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 894 >>> print(list(new.letter_annotations)) 895 [] 896 """ 897 if isinstance(other, SeqRecord): 898 raise RuntimeError("This should have happened via the __add__ of " 899 "the other SeqRecord being added!") 900 # Assume it is a string or a Seq. 901 # Note can't transfer any per-letter-annotations 902 offset = len(other) 903 return SeqRecord(other + self.seq, 904 id=self.id, name=self.name, 905 description=self.description, 906 features=[f._shift(offset) for f in self.features], 907 annotations=self.annotations.copy(), 908 dbxrefs=self.dbxrefs[:])
909
910 - def upper(self):
911 """Returns a copy of the record with an upper case sequence. 912 913 All the annotation is preserved unchanged. e.g. 914 915 >>> from Bio.Alphabet import generic_dna 916 >>> from Bio.Seq import Seq 917 >>> from Bio.SeqRecord import SeqRecord 918 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test", 919 ... description = "Made up for this example") 920 >>> record.letter_annotations["phred_quality"] = [1, 2, 3, 4, 5, 6, 7, 8] 921 >>> print(record.upper().format("fastq")) 922 @Test Made up for this example 923 ACGTACGT 924 + 925 "#$%&'() 926 <BLANKLINE> 927 928 Naturally, there is a matching lower method: 929 930 >>> print(record.lower().format("fastq")) 931 @Test Made up for this example 932 acgtacgt 933 + 934 "#$%&'() 935 <BLANKLINE> 936 """ 937 return SeqRecord(self.seq.upper(), 938 id=self.id, name=self.name, 939 description=self.description, 940 dbxrefs=self.dbxrefs[:], 941 features=self.features[:], 942 annotations=self.annotations.copy(), 943 letter_annotations=self.letter_annotations.copy())
944
945 - def lower(self):
946 """Returns a copy of the record with a lower case sequence. 947 948 All the annotation is preserved unchanged. e.g. 949 950 >>> from Bio import SeqIO 951 >>> record = SeqIO.read("Fasta/aster.pro", "fasta") 952 >>> print(record.format("fasta")) 953 >gi|3298468|dbj|BAA31520.1| SAMIPF 954 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG 955 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI 956 <BLANKLINE> 957 >>> print(record.lower().format("fasta")) 958 >gi|3298468|dbj|BAA31520.1| SAMIPF 959 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg 960 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani 961 <BLANKLINE> 962 963 To take a more annotation rich example, 964 965 >>> from Bio import SeqIO 966 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl") 967 >>> len(old.features) 968 3 969 >>> new = old.lower() 970 >>> len(old.features) == len(new.features) 971 True 972 >>> old.annotations["organism"] == new.annotations["organism"] 973 True 974 >>> old.dbxrefs == new.dbxrefs 975 True 976 """ 977 return SeqRecord(self.seq.lower(), 978 id=self.id, name=self.name, 979 description=self.description, 980 dbxrefs=self.dbxrefs[:], 981 features=self.features[:], 982 annotations=self.annotations.copy(), 983 letter_annotations=self.letter_annotations.copy())
984
985 - def reverse_complement(self, id=False, name=False, description=False, 986 features=True, annotations=False, 987 letter_annotations=True, dbxrefs=False):
988 """Returns new SeqRecord with reverse complement sequence. 989 990 By default the new record does NOT preserve the sequence identifier, 991 name, description, general annotation or database cross-references - 992 these are unlikely to apply to the reversed sequence. 993 994 You can specify the returned record's id, name and description as 995 strings, or True to keep that of the parent, or False for a default. 996 997 You can specify the returned record's features with a list of 998 SeqFeature objects, or True to keep that of the parent, or False to 999 omit them. The default is to keep the original features (with the 1000 strand and locations adjusted). 1001 1002 You can also specify both the returned record's annotations and 1003 letter_annotations as dictionaries, True to keep that of the parent, 1004 or False to omit them. The default is to keep the original 1005 annotations (with the letter annotations reversed). 1006 1007 To show what happens to the pre-letter annotations, consider an 1008 example Solexa variant FASTQ file with a single entry, which we'll 1009 read in as a SeqRecord: 1010 1011 >>> from Bio import SeqIO 1012 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 1013 >>> print("%s %s" % (record.id, record.seq)) 1014 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 1015 >>> print(list(record.letter_annotations)) 1016 ['solexa_quality'] 1017 >>> print(record.letter_annotations["solexa_quality"]) 1018 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 1019 1020 Now take the reverse complement, here we explicitly give a new 1021 identifier (the old identifier with a suffix): 1022 1023 >>> rc_record = record.reverse_complement(id=record.id + "_rc") 1024 >>> print("%s %s" % (rc_record.id, rc_record.seq)) 1025 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT 1026 1027 Notice that the per-letter-annotations have also been reversed, 1028 although this may not be appropriate for all cases. 1029 1030 >>> print(rc_record.letter_annotations["solexa_quality"]) 1031 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] 1032 1033 Now for the features, we need a different example. Parsing a GenBank 1034 file is probably the easiest way to get an nice example with features 1035 in it... 1036 1037 >>> from Bio import SeqIO 1038 >>> with open("GenBank/pBAD30.gb") as handle: 1039 ... plasmid = SeqIO.read(handle, "gb") 1040 >>> print("%s %i" % (plasmid.id, len(plasmid))) 1041 pBAD30 4923 1042 >>> plasmid.seq 1043 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA()) 1044 >>> len(plasmid.features) 1045 13 1046 1047 Now, let's take the reverse complement of this whole plasmid: 1048 1049 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc") 1050 >>> print("%s %i" % (rc_plasmid.id, len(rc_plasmid))) 1051 pBAD30_rc 4923 1052 >>> rc_plasmid.seq 1053 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA()) 1054 >>> len(rc_plasmid.features) 1055 13 1056 1057 Let's compare the first CDS feature - it has gone from being the 1058 second feature (index 1) to the second last feature (index -2), its 1059 strand has changed, and the location switched round. 1060 1061 >>> print(plasmid.features[1]) 1062 type: CDS 1063 location: [1081:1960](-) 1064 qualifiers: 1065 Key: label, Value: ['araC'] 1066 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1067 Key: vntifkey, Value: ['4'] 1068 <BLANKLINE> 1069 >>> print(rc_plasmid.features[-2]) 1070 type: CDS 1071 location: [2963:3842](+) 1072 qualifiers: 1073 Key: label, Value: ['araC'] 1074 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1075 Key: vntifkey, Value: ['4'] 1076 <BLANKLINE> 1077 1078 You can check this new location, based on the length of the plasmid: 1079 1080 >>> len(plasmid) - 1081 1081 3842 1082 >>> len(plasmid) - 1960 1083 2963 1084 1085 Note that if the SeqFeature annotation includes any strand specific 1086 information (e.g. base changes for a SNP), this information is not 1087 amended, and would need correction after the reverse complement. 1088 1089 Note trying to reverse complement a protein SeqRecord raises an 1090 exception: 1091 1092 >>> from Bio.SeqRecord import SeqRecord 1093 >>> from Bio.Seq import Seq 1094 >>> from Bio.Alphabet import IUPAC 1095 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test") 1096 >>> protein_rec.reverse_complement() 1097 Traceback (most recent call last): 1098 ... 1099 ValueError: Proteins do not have complements! 1100 1101 Also note you can reverse complement a SeqRecord using a MutableSeq: 1102 1103 >>> from Bio.SeqRecord import SeqRecord 1104 >>> from Bio.Seq import MutableSeq 1105 >>> from Bio.Alphabet import generic_dna 1106 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test") 1107 >>> rec.seq[0] = "T" 1108 >>> print("%s %s" % (rec.id, rec.seq)) 1109 Test TCGT 1110 >>> rc = rec.reverse_complement(id=True) 1111 >>> print("%s %s" % (rc.id, rc.seq)) 1112 Test ACGA 1113 """ 1114 from Bio.Seq import MutableSeq # Lazy to avoid circular imports 1115 if isinstance(self.seq, MutableSeq): 1116 # Currently the MutableSeq reverse complement is in situ 1117 answer = SeqRecord(self.seq.toseq().reverse_complement()) 1118 else: 1119 answer = SeqRecord(self.seq.reverse_complement()) 1120 if isinstance(id, basestring): 1121 answer.id = id 1122 elif id: 1123 answer.id = self.id 1124 if isinstance(name, basestring): 1125 answer.name = name 1126 elif name: 1127 answer.name = self.name 1128 if isinstance(description, basestring): 1129 answer.description = description 1130 elif description: 1131 answer.description = self.description 1132 if isinstance(dbxrefs, list): 1133 answer.dbxrefs = dbxrefs 1134 elif dbxrefs: 1135 # Copy the old dbxrefs 1136 answer.dbxrefs = self.dbxrefs[:] 1137 if isinstance(features, list): 1138 answer.features = features 1139 elif features: 1140 # Copy the old features, adjusting location and string 1141 l = len(answer) 1142 answer.features = [f._flip(l) for f in self.features] 1143 # The old list should have been sorted by start location, 1144 # reversing it will leave it sorted by what is now the end position, 1145 # so we need to resort in case of overlapping features. 1146 # NOTE - In the common case of gene before CDS (and similar) with 1147 # the exact same locations, this will still maintain gene before CDS 1148 answer.features.sort(key=lambda x: x.location.start.position) 1149 if isinstance(annotations, dict): 1150 answer.annotations = annotations 1151 elif annotations: 1152 # Copy the old annotations, 1153 answer.annotations = self.annotations.copy() 1154 if isinstance(letter_annotations, dict): 1155 answer.letter_annotations = letter_annotations 1156 elif letter_annotations: 1157 # Copy the old per letter annotations, reversing them 1158 for key, value in self.letter_annotations.items(): 1159 answer._per_letter_annotations[key] = value[::-1] 1160 return answer
1161 1162 1163 if __name__ == "__main__": 1164 from Bio._utils import run_doctest 1165 run_doctest() 1166