Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

   1  # Copyright 2000-2002 Andrew Dalke. 
   2  # Copyright 2002-2004 Brad Chapman. 
   3  # Copyright 2006-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Represent a Sequence Record, a sequence with annotation.""" 
   9   
  10   
  11  from Bio._py3k import basestring 
  12   
  13  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
  14  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
  15  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
  16  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
  17   
  18  _NO_SEQRECORD_COMPARISON = "SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest." 
  19   
  20   
21 -class _RestrictedDict(dict):
22 """Dict which only allows sequences of given length as values (PRIVATE). 23 24 This simple subclass of the Python dictionary is used in the SeqRecord 25 object for holding per-letter-annotations. This class is intended to 26 prevent simple errors by only allowing python sequences (e.g. lists, 27 strings and tuples) to be stored, and only if their length matches that 28 expected (the length of the SeqRecord's seq object). It cannot however 29 prevent the entries being edited in situ (for example appending entries 30 to a list). 31 32 >>> x = _RestrictedDict(5) 33 >>> x["test"] = "hello" 34 >>> x 35 {'test': 'hello'} 36 37 Adding entries which don't have the expected length are blocked: 38 39 >>> x["test"] = "hello world" 40 Traceback (most recent call last): 41 ... 42 TypeError: We only allow python sequences (lists, tuples or strings) of length 5. 43 44 The expected length is stored as a private attribute, 45 46 >>> x._length 47 5 48 49 In order that the SeqRecord (and other objects using this class) can be 50 pickled, for example for use in the multiprocessing library, we need to 51 be able to pickle the restricted dictionary objects. 52 53 Using the default protocol, which is 0 on Python 2.x, 54 55 >>> import pickle 56 >>> y = pickle.loads(pickle.dumps(x)) 57 >>> y 58 {'test': 'hello'} 59 >>> y._length 60 5 61 62 Using the highest protocol, which is 2 on Python 2.x, 63 64 >>> import pickle 65 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL)) 66 >>> z 67 {'test': 'hello'} 68 >>> z._length 69 5 70 """ 71
72 - def __init__(self, length):
73 """Create an EMPTY restricted dictionary.""" 74 dict.__init__(self) 75 self._length = int(length)
76
77 - def __setitem__(self, key, value):
78 # The check hasattr(self, "_length") is to cope with pickle protocol 2 79 # I couldn't seem to avoid this with __getstate__ and __setstate__ 80 if not hasattr(value, "__len__") or not hasattr(value, "__getitem__") \ 81 or (hasattr(self, "_length") and len(value) != self._length): 82 raise TypeError("We only allow python sequences (lists, tuples or " 83 "strings) of length {0}.".format(self._length)) 84 dict.__setitem__(self, key, value)
85
86 - def update(self, new_dict):
87 # Force this to go via our strict __setitem__ method 88 for (key, value) in new_dict.items(): 89 self[key] = value
90 91
92 -class SeqRecord(object):
93 """A SeqRecord object holds a sequence and information about it. 94 95 Main attributes: 96 - id - Identifier such as a locus tag (string) 97 - seq - The sequence itself (Seq object or similar) 98 99 Additional attributes: 100 - name - Sequence name, e.g. gene name (string) 101 - description - Additional text (string) 102 - dbxrefs - List of database cross references (list of strings) 103 - features - Any (sub)features defined (list of SeqFeature objects) 104 - annotations - Further information about the whole sequence (dictionary). 105 Most entries are strings, or lists of strings. 106 - letter_annotations - Per letter/symbol annotation (restricted 107 dictionary). This holds Python sequences (lists, strings 108 or tuples) whose length matches that of the sequence. 109 A typical use would be to hold a list of integers 110 representing sequencing quality scores, or a string 111 representing the secondary structure. 112 113 You will typically use Bio.SeqIO to read in sequences from files as 114 SeqRecord objects. However, you may want to create your own SeqRecord 115 objects directly (see the __init__ method for further details): 116 117 >>> from Bio.Seq import Seq 118 >>> from Bio.SeqRecord import SeqRecord 119 >>> from Bio.Alphabet import IUPAC 120 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 121 ... IUPAC.protein), 122 ... id="YP_025292.1", name="HokC", 123 ... description="toxic membrane protein") 124 >>> print(record) 125 ID: YP_025292.1 126 Name: HokC 127 Description: toxic membrane protein 128 Number of features: 0 129 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 130 131 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 132 for this. For the special case where you want the SeqRecord turned into 133 a string in a particular file format there is a format method which uses 134 Bio.SeqIO internally: 135 136 >>> print(record.format("fasta")) 137 >YP_025292.1 toxic membrane protein 138 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 139 <BLANKLINE> 140 141 You can also do things like slicing a SeqRecord, checking its length, etc 142 143 >>> len(record) 144 44 145 >>> edited = record[:10] + record[11:] 146 >>> print(edited.seq) 147 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 148 >>> print(record.seq) 149 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 150 151 """ 152
153 - def __init__(self, seq, id="<unknown id>", name="<unknown name>", 154 description="<unknown description>", dbxrefs=None, 155 features=None, annotations=None, 156 letter_annotations=None):
157 """Create a SeqRecord. 158 159 Arguments: 160 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 161 - id - Sequence identifier, recommended (string) 162 - name - Sequence name, optional (string) 163 - description - Sequence description, optional (string) 164 - dbxrefs - Database cross references, optional (list of strings) 165 - features - Any (sub)features, optional (list of SeqFeature objects) 166 - annotations - Dictionary of annotations for the whole sequence 167 - letter_annotations - Dictionary of per-letter-annotations, values 168 should be strings, list or tuples of the same 169 length as the full sequence. 170 171 You will typically use Bio.SeqIO to read in sequences from files as 172 SeqRecord objects. However, you may want to create your own SeqRecord 173 objects directly. 174 175 Note that while an id is optional, we strongly recommend you supply a 176 unique id string for each record. This is especially important 177 if you wish to write your sequences to a file. 178 179 If you don't have the actual sequence, but you do know its length, 180 then using the UnknownSeq object from Bio.Seq is appropriate. 181 182 You can create a 'blank' SeqRecord object, and then populate the 183 attributes later. 184 """ 185 if id is not None and not isinstance(id, basestring): 186 # Lots of existing code uses id=None... this may be a bad idea. 187 raise TypeError("id argument should be a string") 188 if not isinstance(name, basestring): 189 raise TypeError("name argument should be a string") 190 if not isinstance(description, basestring): 191 raise TypeError("description argument should be a string") 192 self._seq = seq 193 self.id = id 194 self.name = name 195 self.description = description 196 197 # database cross references (for the whole sequence) 198 if dbxrefs is None: 199 dbxrefs = [] 200 elif not isinstance(dbxrefs, list): 201 raise TypeError("dbxrefs argument should be a list (of strings)") 202 self.dbxrefs = dbxrefs 203 204 # annotations about the whole sequence 205 if annotations is None: 206 annotations = {} 207 elif not isinstance(annotations, dict): 208 raise TypeError("annotations argument should be a dict") 209 self.annotations = annotations 210 211 if letter_annotations is None: 212 # annotations about each letter in the sequence 213 if seq is None: 214 # Should we allow this and use a normal unrestricted dict? 215 self._per_letter_annotations = _RestrictedDict(length=0) 216 else: 217 try: 218 self._per_letter_annotations = \ 219 _RestrictedDict(length=len(seq)) 220 except: 221 raise TypeError("seq argument should be a Seq object or similar") 222 else: 223 # This will be handled via the property set function, which will 224 # turn this into a _RestrictedDict and thus ensure all the values 225 # in the dict are the right length 226 self.letter_annotations = letter_annotations 227 228 # annotations about parts of the sequence 229 if features is None: 230 features = [] 231 elif not isinstance(features, list): 232 raise TypeError("features argument should be a list (of SeqFeature objects)") 233 self.features = features
234 235 # TODO - Just make this a read only property?
236 - def _set_per_letter_annotations(self, value):
237 if not isinstance(value, dict): 238 raise TypeError("The per-letter-annotations should be a " 239 "(restricted) dictionary.") 240 # Turn this into a restricted-dictionary (and check the entries) 241 try: 242 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 243 except AttributeError: 244 # e.g. seq is None 245 self._per_letter_annotations = _RestrictedDict(length=0) 246 self._per_letter_annotations.update(value)
247 letter_annotations = property( 248 fget=lambda self: self._per_letter_annotations, 249 fset=_set_per_letter_annotations, 250 doc="""Dictionary of per-letter-annotation for the sequence. 251 252 For example, this can hold quality scores used in FASTQ or QUAL files. 253 Consider this example using Bio.SeqIO to read in an example Solexa 254 variant FASTQ file as a SeqRecord: 255 256 >>> from Bio import SeqIO 257 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 258 >>> print("%s %s" % (record.id, record.seq)) 259 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 260 >>> print(list(record.letter_annotations)) 261 ['solexa_quality'] 262 >>> print(record.letter_annotations["solexa_quality"]) 263 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 264 265 The letter_annotations get sliced automatically if you slice the 266 parent SeqRecord, for example taking the last ten bases: 267 268 >>> sub_record = record[-10:] 269 >>> print("%s %s" % (sub_record.id, sub_record.seq)) 270 slxa_0001_1_0001_01 ACGTNNNNNN 271 >>> print(sub_record.letter_annotations["solexa_quality"]) 272 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 273 274 Any python sequence (i.e. list, tuple or string) can be recorded in 275 the SeqRecord's letter_annotations dictionary as long as the length 276 matches that of the SeqRecord's sequence. e.g. 277 278 >>> len(sub_record.letter_annotations) 279 1 280 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 281 >>> len(sub_record.letter_annotations) 282 2 283 284 You can delete entries from the letter_annotations dictionary as usual: 285 286 >>> del sub_record.letter_annotations["solexa_quality"] 287 >>> sub_record.letter_annotations 288 {'dummy': 'abcdefghij'} 289 290 You can completely clear the dictionary easily as follows: 291 292 >>> sub_record.letter_annotations = {} 293 >>> sub_record.letter_annotations 294 {} 295 296 Note that if replacing the record's sequence with a sequence of a 297 different length you must first clear the letter_annotations dict. 298 """) 299
300 - def _set_seq(self, value):
301 # TODO - Add a deprecation warning that the seq should be write only? 302 if self._per_letter_annotations: 303 if len(self) != len(value): 304 # TODO - Make this a warning? Silently empty the dictionary? 305 raise ValueError("You must empty the letter annotations first!") 306 else: 307 # Leave the existing per letter annotations unchanged: 308 self._seq = value 309 else: 310 self._seq = value 311 # Reset the (empty) letter annotations dict with new length: 312 try: 313 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 314 except AttributeError: 315 # e.g. seq is None 316 self._per_letter_annotations = _RestrictedDict(length=0)
317 318 seq = property(fget=lambda self: self._seq, 319 fset=_set_seq, 320 doc="The sequence itself, as a Seq or MutableSeq object.") 321
322 - def __getitem__(self, index):
323 """Returns a sub-sequence or an individual letter. 324 325 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 326 that sub-sequence with appropriate annotation preserved. The 327 name, id and description are kept. 328 329 Any per-letter-annotations are sliced to match the requested 330 sub-sequence. Unless a stride is used, all those features 331 which fall fully within the subsequence are included (with 332 their locations adjusted accordingly). 333 334 However, the annotations dictionary and the dbxrefs list are 335 not used for the new SeqRecord, as in general they may not 336 apply to the subsequence. If you want to preserve them, you 337 must explicitly copy them to the new SeqRecord yourself. 338 339 Using an integer index, e.g. my_record[5] is shorthand for 340 extracting that letter from the sequence, my_record.seq[5]. 341 342 For example, consider this short protein and its secondary 343 structure as encoded by the PDB (e.g. H for alpha helices), 344 plus a simple feature for its histidine self phosphorylation 345 site: 346 347 >>> from Bio.Seq import Seq 348 >>> from Bio.SeqRecord import SeqRecord 349 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 350 >>> from Bio.Alphabet import IUPAC 351 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 352 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 353 ... IUPAC.protein), 354 ... id="1JOY", name="EnvZ", 355 ... description="Homodimeric domain of EnvZ from E. coli") 356 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 357 >>> rec.features.append(SeqFeature(FeatureLocation(20, 21), 358 ... type = "Site")) 359 360 Now let's have a quick look at the full record, 361 362 >>> print(rec) 363 ID: 1JOY 364 Name: EnvZ 365 Description: Homodimeric domain of EnvZ from E. coli 366 Number of features: 1 367 Per letter annotation for: secondary_structure 368 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 369 >>> rec.letter_annotations["secondary_structure"] 370 ' S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT ' 371 >>> print(rec.features[0].location) 372 [20:21] 373 374 Now let's take a sub sequence, here chosen as the first (fractured) 375 alpha helix which includes the histidine phosphorylation site: 376 377 >>> sub = rec[11:41] 378 >>> print(sub) 379 ID: 1JOY 380 Name: EnvZ 381 Description: Homodimeric domain of EnvZ from E. coli 382 Number of features: 1 383 Per letter annotation for: secondary_structure 384 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 385 >>> sub.letter_annotations["secondary_structure"] 386 'HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH' 387 >>> print(sub.features[0].location) 388 [9:10] 389 390 You can also of course omit the start or end values, for 391 example to get the first ten letters only: 392 393 >>> print(rec[:10]) 394 ID: 1JOY 395 Name: EnvZ 396 Description: Homodimeric domain of EnvZ from E. coli 397 Number of features: 0 398 Per letter annotation for: secondary_structure 399 Seq('MAAGVKQLAD', IUPACProtein()) 400 401 Or for the last ten letters: 402 403 >>> print(rec[-10:]) 404 ID: 1JOY 405 Name: EnvZ 406 Description: Homodimeric domain of EnvZ from E. coli 407 Number of features: 0 408 Per letter annotation for: secondary_structure 409 Seq('IIEQFIDYLR', IUPACProtein()) 410 411 If you omit both, then you get a copy of the original record (although 412 lacking the annotations and dbxrefs): 413 414 >>> print(rec[:]) 415 ID: 1JOY 416 Name: EnvZ 417 Description: Homodimeric domain of EnvZ from E. coli 418 Number of features: 1 419 Per letter annotation for: secondary_structure 420 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 421 422 Finally, indexing with a simple integer is shorthand for pulling out 423 that letter from the sequence directly: 424 425 >>> rec[5] 426 'K' 427 >>> rec.seq[5] 428 'K' 429 """ 430 if isinstance(index, int): 431 # NOTE - The sequence level annotation like the id, name, etc 432 # do not really apply to a single character. However, should 433 # we try and expose any per-letter-annotation here? If so how? 434 return self.seq[index] 435 elif isinstance(index, slice): 436 if self.seq is None: 437 raise ValueError("If the sequence is None, we cannot slice it.") 438 parent_length = len(self) 439 from BioSQL.BioSeq import DBSeqRecord 440 if isinstance(self, DBSeqRecord): 441 answer = SeqRecord(self.seq[index], 442 id=self.id, 443 name=self.name, 444 description=self.description) 445 else: 446 answer = self.__class__(self.seq[index], 447 id=self.id, 448 name=self.name, 449 description=self.description) 450 # TODO - The description may no longer apply. 451 # It would be safer to change it to something 452 # generic like "edited" or the default value. 453 454 # Don't copy the annotation dict and dbxefs list, 455 # they may not apply to a subsequence. 456 # answer.annotations = dict(self.annotations.items()) 457 # answer.dbxrefs = self.dbxrefs[:] 458 # TODO - Review this in light of adding SeqRecord objects? 459 460 # TODO - Cope with strides by generating ambiguous locations? 461 start, stop, step = index.indices(parent_length) 462 if step == 1: 463 # Select relevant features, add them with shifted locations 464 # assert str(self.seq)[index] == str(self.seq)[start:stop] 465 for f in self.features: 466 if f.ref or f.ref_db: 467 # TODO - Implement this (with lots of tests)? 468 import warnings 469 warnings.warn("When slicing SeqRecord objects, any " 470 "SeqFeature referencing other sequences (e.g. " 471 "from segmented GenBank records) are ignored.") 472 continue 473 if start <= f.location.nofuzzy_start \ 474 and f.location.nofuzzy_end <= stop: 475 answer.features.append(f._shift(-start)) 476 477 # Slice all the values to match the sliced sequence 478 # (this should also work with strides, even negative strides): 479 for key, value in self.letter_annotations.items(): 480 answer._per_letter_annotations[key] = value[index] 481 482 return answer 483 raise ValueError("Invalid index")
484
485 - def __iter__(self):
486 """Iterate over the letters in the sequence. 487 488 For example, using Bio.SeqIO to read in a protein FASTA file: 489 490 >>> from Bio import SeqIO 491 >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta") 492 >>> for amino in record: 493 ... print(amino) 494 ... if amino == "L": break 495 X 496 A 497 G 498 L 499 >>> print(record.seq[3]) 500 L 501 502 This is just a shortcut for iterating over the sequence directly: 503 504 >>> for amino in record.seq: 505 ... print(amino) 506 ... if amino == "L": break 507 X 508 A 509 G 510 L 511 >>> print(record.seq[3]) 512 L 513 514 Note that this does not facilitate iteration together with any 515 per-letter-annotation. However, you can achieve that using the 516 python zip function on the record (or its sequence) and the relevant 517 per-letter-annotation: 518 519 >>> from Bio import SeqIO 520 >>> rec = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 521 >>> print("%s %s" % (rec.id, rec.seq)) 522 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 523 >>> print(list(rec.letter_annotations)) 524 ['solexa_quality'] 525 >>> for nuc, qual in zip(rec, rec.letter_annotations["solexa_quality"]): 526 ... if qual > 35: 527 ... print("%s %i" % (nuc, qual)) 528 A 40 529 C 39 530 G 38 531 T 37 532 A 36 533 534 You may agree that using zip(rec.seq, ...) is more explicit than using 535 zip(rec, ...) as shown above. 536 """ 537 return iter(self.seq)
538
539 - def __contains__(self, char):
540 """Implements the 'in' keyword, searches the sequence. 541 542 e.g. 543 544 >>> from Bio import SeqIO 545 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 546 >>> "GAATTC" in record 547 False 548 >>> "AAA" in record 549 True 550 551 This essentially acts as a proxy for using "in" on the sequence: 552 553 >>> "GAATTC" in record.seq 554 False 555 >>> "AAA" in record.seq 556 True 557 558 Note that you can also use Seq objects as the query, 559 560 >>> from Bio.Seq import Seq 561 >>> from Bio.Alphabet import generic_dna 562 >>> Seq("AAA") in record 563 True 564 >>> Seq("AAA", generic_dna) in record 565 True 566 567 See also the Seq object's __contains__ method. 568 """ 569 return char in self.seq
570
571 - def __str__(self):
572 """A human readable summary of the record and its annotation (string). 573 574 The python built in function str works by calling the object's ___str__ 575 method. e.g. 576 577 >>> from Bio.Seq import Seq 578 >>> from Bio.SeqRecord import SeqRecord 579 >>> from Bio.Alphabet import IUPAC 580 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 581 ... IUPAC.protein), 582 ... id="YP_025292.1", name="HokC", 583 ... description="toxic membrane protein, small") 584 >>> print(str(record)) 585 ID: YP_025292.1 586 Name: HokC 587 Description: toxic membrane protein, small 588 Number of features: 0 589 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 590 591 In this example you don't actually need to call str explicity, as the 592 print command does this automatically: 593 594 >>> print(record) 595 ID: YP_025292.1 596 Name: HokC 597 Description: toxic membrane protein, small 598 Number of features: 0 599 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 600 601 Note that long sequences are shown truncated. 602 """ 603 lines = [] 604 if self.id: 605 lines.append("ID: {0}".format(self.id)) 606 if self.name: 607 lines.append("Name: {0}".format(self.name)) 608 if self.description: 609 lines.append("Description: {0}".format(self.description)) 610 if self.dbxrefs: 611 lines.append("Database cross-references: " + ", ".join(self.dbxrefs)) 612 lines.append("Number of features: {0}".format(len(self.features))) 613 for a in self.annotations: 614 lines.append("/{0}={1}".format(a, str(self.annotations[a]))) 615 if self.letter_annotations: 616 lines.append("Per letter annotation for: " + ", ".join(self.letter_annotations)) 617 # Don't want to include the entire sequence, 618 # and showing the alphabet is useful: 619 lines.append(repr(self.seq)) 620 return "\n".join(lines)
621
622 - def __repr__(self):
623 """A concise summary of the record for debugging (string). 624 625 The python built in function repr works by calling the object's ___repr__ 626 method. e.g. 627 628 >>> from Bio.Seq import Seq 629 >>> from Bio.SeqRecord import SeqRecord 630 >>> from Bio.Alphabet import generic_protein 631 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 632 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 633 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 634 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 635 ... generic_protein), 636 ... id="NP_418483.1", name="b4059", 637 ... description="ssDNA-binding protein", 638 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 639 >>> print(repr(rec)) 640 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 641 642 At the python prompt you can also use this shorthand: 643 644 >>> rec 645 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 646 647 Note that long sequences are shown truncated. Also note that any 648 annotations, letter_annotations and features are not shown (as they 649 would lead to a very long string). 650 """ 651 return "{0}(seq={1!r}, id={2!r}, name={3!r}, description={4!r}, dbxrefs={5!r})".format( 652 self.__class__.__name__, 653 self.seq, self.id, self.name, 654 self.description, self.dbxrefs)
655
656 - def format(self, format):
657 r"""Returns the record as a string in the specified file format. 658 659 The format should be a lower case string supported as an output 660 format by Bio.SeqIO, which is used to turn the SeqRecord into a 661 string. e.g. 662 663 >>> from Bio.Seq import Seq 664 >>> from Bio.SeqRecord import SeqRecord 665 >>> from Bio.Alphabet import IUPAC 666 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 667 ... IUPAC.protein), 668 ... id="YP_025292.1", name="HokC", 669 ... description="toxic membrane protein") 670 >>> record.format("fasta") 671 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 672 >>> print(record.format("fasta")) 673 >YP_025292.1 toxic membrane protein 674 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 675 <BLANKLINE> 676 677 The python print command automatically appends a new line, meaning 678 in this example a blank line is shown. If you look at the string 679 representation you can see there is a trailing new line (shown as 680 slash n) which is important when writing to a file or if 681 concatenating multiple sequence strings together. 682 683 Note that this method will NOT work on every possible file format 684 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 685 """ 686 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 687 # See also the Bio.Align.Generic.Alignment class and its format() 688 return self.__format__(format)
689
690 - def __format__(self, format_spec):
691 """Returns the record as a string in the specified file format. 692 693 This method supports the python format() function added in 694 Python 2.6/3.0. The format_spec should be a lower case string 695 supported by Bio.SeqIO as an output file format. See also the 696 SeqRecord's format() method. 697 698 Under Python 3 please note that for binary formats a bytes 699 string is returned, otherwise a (unicode) string is returned. 700 """ 701 if not format_spec: 702 # Follow python convention and default to using __str__ 703 return str(self) 704 from Bio import SeqIO 705 if format_spec in SeqIO._BinaryFormats: 706 # Return bytes on Python 3 707 from io import BytesIO 708 handle = BytesIO() 709 else: 710 from Bio._py3k import StringIO 711 handle = StringIO() 712 SeqIO.write(self, handle, format_spec) 713 return handle.getvalue()
714
715 - def __len__(self):
716 """Returns the length of the sequence. 717 718 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 719 720 >>> from Bio import SeqIO 721 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 722 >>> len(record) 723 309 724 >>> len(record.seq) 725 309 726 """ 727 return len(self.seq)
728
729 - def __lt__(self, other):
730 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
731
732 - def __le___(self, other):
733 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
734
735 - def __eq__(self, other):
736 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
737
738 - def __ne__(self, other):
739 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
740
741 - def __gt__(self, other):
742 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
743
744 - def __ge__(self, other):
745 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
746 747 # Note Python 3 does not use __cmp__ and there is no need to 748 # define __cmp__ on Python 2 as have all of _lt__ etc defined. 749 750 # Python 3:
751 - def __bool__(self):
752 """Boolean value of an instance of this class (True). 753 754 This behaviour is for backwards compatibility, since until the 755 __len__ method was added, a SeqRecord always evaluated as True. 756 757 Note that in comparison, a Seq object will evaluate to False if it 758 has a zero length sequence. 759 760 WARNING: The SeqRecord may in future evaluate to False when its 761 sequence is of zero length (in order to better match the Seq 762 object behaviour)! 763 """ 764 return True
765 766 # Python 2: 767 __nonzero__ = __bool__ 768
769 - def __add__(self, other):
770 """Add another sequence or string to this sequence. 771 772 The other sequence can be a SeqRecord object, a Seq object (or 773 similar, e.g. a MutableSeq) or a plain Python string. If you add 774 a plain string or a Seq (like) object, the new SeqRecord will simply 775 have this appended to the existing data. However, any per letter 776 annotation will be lost: 777 778 >>> from Bio import SeqIO 779 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 780 >>> print("%s %s" % (record.id, record.seq)) 781 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 782 >>> print(list(record.letter_annotations)) 783 ['solexa_quality'] 784 785 >>> new = record + "ACT" 786 >>> print("%s %s" % (new.id, new.seq)) 787 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 788 >>> print(list(new.letter_annotations)) 789 [] 790 791 The new record will attempt to combine the annotation, but for any 792 ambiguities (e.g. different names) it defaults to omitting that 793 annotation. 794 795 >>> from Bio import SeqIO 796 >>> with open("GenBank/pBAD30.gb") as handle: 797 ... plasmid = SeqIO.read(handle, "gb") 798 >>> print("%s %i" % (plasmid.id, len(plasmid))) 799 pBAD30 4923 800 801 Now let's cut the plasmid into two pieces, and join them back up the 802 other way round (i.e. shift the starting point on this plasmid, have 803 a look at the annotated features in the original file to see why this 804 particular split point might make sense): 805 806 >>> left = plasmid[:3765] 807 >>> right = plasmid[3765:] 808 >>> new = right + left 809 >>> print("%s %i" % (new.id, len(new))) 810 pBAD30 4923 811 >>> str(new.seq) == str(right.seq + left.seq) 812 True 813 >>> len(new.features) == len(left.features) + len(right.features) 814 True 815 816 When we add the left and right SeqRecord objects, their annotation 817 is all consistent, so it is all conserved in the new SeqRecord: 818 819 >>> new.id == left.id == right.id == plasmid.id 820 True 821 >>> new.name == left.name == right.name == plasmid.name 822 True 823 >>> new.description == plasmid.description 824 True 825 >>> new.annotations == left.annotations == right.annotations 826 True 827 >>> new.letter_annotations == plasmid.letter_annotations 828 True 829 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 830 True 831 832 However, we should point out that when we sliced the SeqRecord, 833 any annotations dictionary or dbxrefs list entries were lost. 834 You can explicitly copy them like this: 835 836 >>> new.annotations = plasmid.annotations.copy() 837 >>> new.dbxrefs = plasmid.dbxrefs[:] 838 """ 839 if not isinstance(other, SeqRecord): 840 # Assume it is a string or a Seq. 841 # Note can't transfer any per-letter-annotations 842 return SeqRecord(self.seq + other, 843 id=self.id, name=self.name, 844 description=self.description, 845 features=self.features[:], 846 annotations=self.annotations.copy(), 847 dbxrefs=self.dbxrefs[:]) 848 # Adding two SeqRecord objects... must merge annotation. 849 answer = SeqRecord(self.seq + other.seq, 850 features=self.features[:], 851 dbxrefs=self.dbxrefs[:]) 852 # Will take all the features and all the db cross refs, 853 l = len(self) 854 for f in other.features: 855 answer.features.append(f._shift(l)) 856 del l 857 for ref in other.dbxrefs: 858 if ref not in answer.dbxrefs: 859 answer.dbxrefs.append(ref) 860 # Take common id/name/description/annotation 861 if self.id == other.id: 862 answer.id = self.id 863 if self.name == other.name: 864 answer.name = self.name 865 if self.description == other.description: 866 answer.description = self.description 867 for k, v in self.annotations.items(): 868 if k in other.annotations and other.annotations[k] == v: 869 answer.annotations[k] = v 870 # Can append matching per-letter-annotation 871 for k, v in self.letter_annotations.items(): 872 if k in other.letter_annotations: 873 answer.letter_annotations[k] = v + other.letter_annotations[k] 874 return answer
875
876 - def __radd__(self, other):
877 """Add another sequence or string to this sequence (from the left). 878 879 This method handles adding a Seq object (or similar, e.g. MutableSeq) 880 or a plain Python string (on the left) to a SeqRecord (on the right). 881 See the __add__ method for more details, but for example: 882 883 >>> from Bio import SeqIO 884 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 885 >>> print("%s %s" % (record.id, record.seq)) 886 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 887 >>> print(list(record.letter_annotations)) 888 ['solexa_quality'] 889 890 >>> new = "ACT" + record 891 >>> print("%s %s" % (new.id, new.seq)) 892 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 893 >>> print(list(new.letter_annotations)) 894 [] 895 """ 896 if isinstance(other, SeqRecord): 897 raise RuntimeError("This should have happened via the __add__ of " 898 "the other SeqRecord being added!") 899 # Assume it is a string or a Seq. 900 # Note can't transfer any per-letter-annotations 901 offset = len(other) 902 return SeqRecord(other + self.seq, 903 id=self.id, name=self.name, 904 description=self.description, 905 features=[f._shift(offset) for f in self.features], 906 annotations=self.annotations.copy(), 907 dbxrefs=self.dbxrefs[:])
908
909 - def upper(self):
910 """Returns a copy of the record with an upper case sequence. 911 912 All the annotation is preserved unchanged. e.g. 913 914 >>> from Bio.Alphabet import generic_dna 915 >>> from Bio.Seq import Seq 916 >>> from Bio.SeqRecord import SeqRecord 917 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test", 918 ... description = "Made up for this example") 919 >>> record.letter_annotations["phred_quality"] = [1, 2, 3, 4, 5, 6, 7, 8] 920 >>> print(record.upper().format("fastq")) 921 @Test Made up for this example 922 ACGTACGT 923 + 924 "#$%&'() 925 <BLANKLINE> 926 927 Naturally, there is a matching lower method: 928 929 >>> print(record.lower().format("fastq")) 930 @Test Made up for this example 931 acgtacgt 932 + 933 "#$%&'() 934 <BLANKLINE> 935 """ 936 return SeqRecord(self.seq.upper(), 937 id=self.id, name=self.name, 938 description=self.description, 939 dbxrefs=self.dbxrefs[:], 940 features=self.features[:], 941 annotations=self.annotations.copy(), 942 letter_annotations=self.letter_annotations.copy())
943
944 - def lower(self):
945 """Returns a copy of the record with a lower case sequence. 946 947 All the annotation is preserved unchanged. e.g. 948 949 >>> from Bio import SeqIO 950 >>> record = SeqIO.read("Fasta/aster.pro", "fasta") 951 >>> print(record.format("fasta")) 952 >gi|3298468|dbj|BAA31520.1| SAMIPF 953 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG 954 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI 955 <BLANKLINE> 956 >>> print(record.lower().format("fasta")) 957 >gi|3298468|dbj|BAA31520.1| SAMIPF 958 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg 959 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani 960 <BLANKLINE> 961 962 To take a more annotation rich example, 963 964 >>> from Bio import SeqIO 965 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl") 966 >>> len(old.features) 967 3 968 >>> new = old.lower() 969 >>> len(old.features) == len(new.features) 970 True 971 >>> old.annotations["organism"] == new.annotations["organism"] 972 True 973 >>> old.dbxrefs == new.dbxrefs 974 True 975 """ 976 return SeqRecord(self.seq.lower(), 977 id=self.id, name=self.name, 978 description=self.description, 979 dbxrefs=self.dbxrefs[:], 980 features=self.features[:], 981 annotations=self.annotations.copy(), 982 letter_annotations=self.letter_annotations.copy())
983
984 - def reverse_complement(self, id=False, name=False, description=False, 985 features=True, annotations=False, 986 letter_annotations=True, dbxrefs=False):
987 """Returns new SeqRecord with reverse complement sequence. 988 989 You can specify the returned record's id, name and description as 990 strings, or True to keep that of the parent, or False for a default. 991 992 You can specify the returned record's features with a list of 993 SeqFeature objects, or True to keep that of the parent, or False to 994 omit them. The default is to keep the original features (with the 995 strand and locations adjusted). 996 997 You can also specify both the returned record's annotations and 998 letter_annotations as dictionaries, True to keep that of the parent, 999 or False to omit them. The default is to keep the original 1000 annotations (with the letter annotations reversed). 1001 1002 To show what happens to the pre-letter annotations, consider an 1003 example Solexa variant FASTQ file with a single entry, which we'll 1004 read in as a SeqRecord: 1005 1006 >>> from Bio import SeqIO 1007 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 1008 >>> print("%s %s" % (record.id, record.seq)) 1009 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 1010 >>> print(list(record.letter_annotations)) 1011 ['solexa_quality'] 1012 >>> print(record.letter_annotations["solexa_quality"]) 1013 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 1014 1015 Now take the reverse complement, 1016 1017 >>> rc_record = record.reverse_complement(id=record.id+"_rc") 1018 >>> print("%s %s" % (rc_record.id, rc_record.seq)) 1019 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT 1020 1021 Notice that the per-letter-annotations have also been reversed, 1022 although this may not be appropriate for all cases. 1023 1024 >>> print(rc_record.letter_annotations["solexa_quality"]) 1025 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] 1026 1027 Now for the features, we need a different example. Parsing a GenBank 1028 file is probably the easiest way to get an nice example with features 1029 in it... 1030 1031 >>> from Bio import SeqIO 1032 >>> with open("GenBank/pBAD30.gb") as handle: 1033 ... plasmid = SeqIO.read(handle, "gb") 1034 >>> print("%s %i" % (plasmid.id, len(plasmid))) 1035 pBAD30 4923 1036 >>> plasmid.seq 1037 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA()) 1038 >>> len(plasmid.features) 1039 13 1040 1041 Now, let's take the reverse complement of this whole plasmid: 1042 1043 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc") 1044 >>> print("%s %i" % (rc_plasmid.id, len(rc_plasmid))) 1045 pBAD30_rc 4923 1046 >>> rc_plasmid.seq 1047 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA()) 1048 >>> len(rc_plasmid.features) 1049 13 1050 1051 Let's compare the first CDS feature - it has gone from being the 1052 second feature (index 1) to the second last feature (index -2), its 1053 strand has changed, and the location switched round. 1054 1055 >>> print(plasmid.features[1]) 1056 type: CDS 1057 location: [1081:1960](-) 1058 qualifiers: 1059 Key: label, Value: ['araC'] 1060 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1061 Key: vntifkey, Value: ['4'] 1062 <BLANKLINE> 1063 >>> print(rc_plasmid.features[-2]) 1064 type: CDS 1065 location: [2963:3842](+) 1066 qualifiers: 1067 Key: label, Value: ['araC'] 1068 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1069 Key: vntifkey, Value: ['4'] 1070 <BLANKLINE> 1071 1072 You can check this new location, based on the length of the plasmid: 1073 1074 >>> len(plasmid) - 1081 1075 3842 1076 >>> len(plasmid) - 1960 1077 2963 1078 1079 Note that if the SeqFeature annotation includes any strand specific 1080 information (e.g. base changes for a SNP), this information is not 1081 amended, and would need correction after the reverse complement. 1082 1083 Note trying to reverse complement a protein SeqRecord raises an 1084 exception: 1085 1086 >>> from Bio.SeqRecord import SeqRecord 1087 >>> from Bio.Seq import Seq 1088 >>> from Bio.Alphabet import IUPAC 1089 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test") 1090 >>> protein_rec.reverse_complement() 1091 Traceback (most recent call last): 1092 ... 1093 ValueError: Proteins do not have complements! 1094 1095 Also note you can reverse complement a SeqRecord using a MutableSeq: 1096 1097 >>> from Bio.SeqRecord import SeqRecord 1098 >>> from Bio.Seq import MutableSeq 1099 >>> from Bio.Alphabet import generic_dna 1100 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test") 1101 >>> rec.seq[0] = "T" 1102 >>> print("%s %s" % (rec.id, rec.seq)) 1103 Test TCGT 1104 >>> rc = rec.reverse_complement(id=True) 1105 >>> print("%s %s" % (rc.id, rc.seq)) 1106 Test ACGA 1107 """ 1108 from Bio.Seq import MutableSeq # Lazy to avoid circular imports 1109 if isinstance(self.seq, MutableSeq): 1110 # Currently the MutableSeq reverse complement is in situ 1111 answer = SeqRecord(self.seq.toseq().reverse_complement()) 1112 else: 1113 answer = SeqRecord(self.seq.reverse_complement()) 1114 if isinstance(id, basestring): 1115 answer.id = id 1116 elif id: 1117 answer.id = self.id 1118 if isinstance(name, basestring): 1119 answer.name = name 1120 elif name: 1121 answer.name = self.name 1122 if isinstance(description, basestring): 1123 answer.description = description 1124 elif description: 1125 answer.description = self.description 1126 if isinstance(dbxrefs, list): 1127 answer.dbxrefs = dbxrefs 1128 elif dbxrefs: 1129 # Copy the old dbxrefs 1130 answer.dbxrefs = self.dbxrefs[:] 1131 if isinstance(features, list): 1132 answer.features = features 1133 elif features: 1134 # Copy the old features, adjusting location and string 1135 l = len(answer) 1136 answer.features = [f._flip(l) for f in self.features] 1137 # The old list should have been sorted by start location, 1138 # reversing it will leave it sorted by what is now the end position, 1139 # so we need to resort in case of overlapping features. 1140 # NOTE - In the common case of gene before CDS (and similar) with 1141 # the exact same locations, this will still maintain gene before CDS 1142 answer.features.sort(key=lambda x: x.location.start.position) 1143 if isinstance(annotations, dict): 1144 answer.annotations = annotations 1145 elif annotations: 1146 # Copy the old annotations, 1147 answer.annotations = self.annotations.copy() 1148 if isinstance(letter_annotations, dict): 1149 answer.letter_annotations = letter_annotations 1150 elif letter_annotations: 1151 # Copy the old per letter annotations, reversing them 1152 for key, value in self.letter_annotations.items(): 1153 answer._per_letter_annotations[key] = value[::-1] 1154 return answer
1155 1156 1157 if __name__ == "__main__": 1158 from Bio._utils import run_doctest 1159 run_doctest() 1160