Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

   1  # Copyright 2000-2002 Andrew Dalke. 
   2  # Copyright 2002-2004 Brad Chapman. 
   3  # Copyright 2006-2010 by Peter Cock. 
   4  # All rights reserved. 
   5  # This code is part of the Biopython distribution and governed by its 
   6  # license.  Please see the LICENSE file that should have been included 
   7  # as part of this package. 
   8  """Represent a Sequence Record, a sequence with annotation.""" 
   9   
  10   
  11  from Bio._py3k import basestring 
  12   
  13  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
  14  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
  15  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
  16  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
  17   
  18  _NO_SEQRECORD_COMPARISON = "SeqRecord comparison is deliberately not implemented. Explicitly compare the attributes of interest." 
  19   
  20   
21 -class _RestrictedDict(dict):
22 """Dict which only allows sequences of given length as values (PRIVATE). 23 24 This simple subclass of the Python dictionary is used in the SeqRecord 25 object for holding per-letter-annotations. This class is intended to 26 prevent simple errors by only allowing python sequences (e.g. lists, 27 strings and tuples) to be stored, and only if their length matches that 28 expected (the length of the SeqRecord's seq object). It cannot however 29 prevent the entries being edited in situ (for example appending entries 30 to a list). 31 32 >>> x = _RestrictedDict(5) 33 >>> x["test"] = "hello" 34 >>> x 35 {'test': 'hello'} 36 37 Adding entries which don't have the expected length are blocked: 38 39 >>> x["test"] = "hello world" 40 Traceback (most recent call last): 41 ... 42 TypeError: We only allow python sequences (lists, tuples or strings) of length 5. 43 44 The expected length is stored as a private attribute, 45 46 >>> x._length 47 5 48 49 In order that the SeqRecord (and other objects using this class) can be 50 pickled, for example for use in the multiprocessing library, we need to 51 be able to pickle the restricted dictionary objects. 52 53 Using the default protocol, which is 0 on Python 2.x, 54 55 >>> import pickle 56 >>> y = pickle.loads(pickle.dumps(x)) 57 >>> y 58 {'test': 'hello'} 59 >>> y._length 60 5 61 62 Using the highest protocol, which is 2 on Python 2.x, 63 64 >>> import pickle 65 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL)) 66 >>> z 67 {'test': 'hello'} 68 >>> z._length 69 5 70 """ 71
72 - def __init__(self, length):
73 """Create an EMPTY restricted dictionary.""" 74 dict.__init__(self) 75 self._length = int(length)
76
77 - def __setitem__(self, key, value):
78 # The check hasattr(self, "_length") is to cope with pickle protocol 2 79 # I couldn't seem to avoid this with __getstate__ and __setstate__ 80 if not hasattr(value, "__len__") or not hasattr(value, "__getitem__") \ 81 or (hasattr(self, "_length") and len(value) != self._length): 82 raise TypeError("We only allow python sequences (lists, tuples or " 83 "strings) of length {0}.".format(self._length)) 84 dict.__setitem__(self, key, value)
85
86 - def update(self, new_dict):
87 # Force this to go via our strict __setitem__ method 88 for (key, value) in new_dict.items(): 89 self[key] = value
90 91
92 -class SeqRecord(object):
93 """A SeqRecord object holds a sequence and information about it. 94 95 Main attributes: 96 - id - Identifier such as a locus tag (string) 97 - seq - The sequence itself (Seq object or similar) 98 99 Additional attributes: 100 - name - Sequence name, e.g. gene name (string) 101 - description - Additional text (string) 102 - dbxrefs - List of database cross references (list of strings) 103 - features - Any (sub)features defined (list of SeqFeature objects) 104 - annotations - Further information about the whole sequence (dictionary). 105 Most entries are strings, or lists of strings. 106 - letter_annotations - Per letter/symbol annotation (restricted 107 dictionary). This holds Python sequences (lists, strings 108 or tuples) whose length matches that of the sequence. 109 A typical use would be to hold a list of integers 110 representing sequencing quality scores, or a string 111 representing the secondary structure. 112 113 You will typically use Bio.SeqIO to read in sequences from files as 114 SeqRecord objects. However, you may want to create your own SeqRecord 115 objects directly (see the __init__ method for further details): 116 117 >>> from Bio.Seq import Seq 118 >>> from Bio.SeqRecord import SeqRecord 119 >>> from Bio.Alphabet import IUPAC 120 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 121 ... IUPAC.protein), 122 ... id="YP_025292.1", name="HokC", 123 ... description="toxic membrane protein") 124 >>> print(record) 125 ID: YP_025292.1 126 Name: HokC 127 Description: toxic membrane protein 128 Number of features: 0 129 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 130 131 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 132 for this. For the special case where you want the SeqRecord turned into 133 a string in a particular file format there is a format method which uses 134 Bio.SeqIO internally: 135 136 >>> print(record.format("fasta")) 137 >YP_025292.1 toxic membrane protein 138 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 139 <BLANKLINE> 140 141 You can also do things like slicing a SeqRecord, checking its length, etc 142 143 >>> len(record) 144 44 145 >>> edited = record[:10] + record[11:] 146 >>> print(edited.seq) 147 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 148 >>> print(record.seq) 149 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 150 151 """ 152
153 - def __init__(self, seq, id="<unknown id>", name="<unknown name>", 154 description="<unknown description>", dbxrefs=None, 155 features=None, annotations=None, 156 letter_annotations=None):
157 """Create a SeqRecord. 158 159 Arguments: 160 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 161 - id - Sequence identifier, recommended (string) 162 - name - Sequence name, optional (string) 163 - description - Sequence description, optional (string) 164 - dbxrefs - Database cross references, optional (list of strings) 165 - features - Any (sub)features, optional (list of SeqFeature objects) 166 - annotations - Dictionary of annotations for the whole sequence 167 - letter_annotations - Dictionary of per-letter-annotations, values 168 should be strings, list or tuples of the same 169 length as the full sequence. 170 171 You will typically use Bio.SeqIO to read in sequences from files as 172 SeqRecord objects. However, you may want to create your own SeqRecord 173 objects directly. 174 175 Note that while an id is optional, we strongly recommend you supply a 176 unique id string for each record. This is especially important 177 if you wish to write your sequences to a file. 178 179 If you don't have the actual sequence, but you do know its length, 180 then using the UnknownSeq object from Bio.Seq is appropriate. 181 182 You can create a 'blank' SeqRecord object, and then populate the 183 attributes later. 184 """ 185 if id is not None and not isinstance(id, basestring): 186 # Lots of existing code uses id=None... this may be a bad idea. 187 raise TypeError("id argument should be a string") 188 if not isinstance(name, basestring): 189 raise TypeError("name argument should be a string") 190 if not isinstance(description, basestring): 191 raise TypeError("description argument should be a string") 192 self._seq = seq 193 self.id = id 194 self.name = name 195 self.description = description 196 197 # database cross references (for the whole sequence) 198 if dbxrefs is None: 199 dbxrefs = [] 200 elif not isinstance(dbxrefs, list): 201 raise TypeError("dbxrefs argument should be a list (of strings)") 202 self.dbxrefs = dbxrefs 203 204 # annotations about the whole sequence 205 if annotations is None: 206 annotations = {} 207 elif not isinstance(annotations, dict): 208 raise TypeError("annotations argument should be a dict") 209 self.annotations = annotations 210 211 if letter_annotations is None: 212 # annotations about each letter in the sequence 213 if seq is None: 214 # Should we allow this and use a normal unrestricted dict? 215 self._per_letter_annotations = _RestrictedDict(length=0) 216 else: 217 try: 218 self._per_letter_annotations = \ 219 _RestrictedDict(length=len(seq)) 220 except: 221 raise TypeError("seq argument should be a Seq object or similar") 222 else: 223 # This will be handled via the property set function, which will 224 # turn this into a _RestrictedDict and thus ensure all the values 225 # in the dict are the right length 226 self.letter_annotations = letter_annotations 227 228 # annotations about parts of the sequence 229 if features is None: 230 features = [] 231 elif not isinstance(features, list): 232 raise TypeError("features argument should be a list (of SeqFeature objects)") 233 self.features = features
234 235 # TODO - Just make this a read only property?
236 - def _set_per_letter_annotations(self, value):
237 if not isinstance(value, dict): 238 raise TypeError("The per-letter-annotations should be a " 239 "(restricted) dictionary.") 240 # Turn this into a restricted-dictionary (and check the entries) 241 try: 242 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 243 except AttributeError: 244 # e.g. seq is None 245 self._per_letter_annotations = _RestrictedDict(length=0) 246 self._per_letter_annotations.update(value)
247 letter_annotations = property( 248 fget=lambda self: self._per_letter_annotations, 249 fset=_set_per_letter_annotations, 250 doc="""Dictionary of per-letter-annotation for the sequence. 251 252 For example, this can hold quality scores used in FASTQ or QUAL files. 253 Consider this example using Bio.SeqIO to read in an example Solexa 254 variant FASTQ file as a SeqRecord: 255 256 >>> from Bio import SeqIO 257 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 258 >>> print("%s %s" % (record.id, record.seq)) 259 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 260 >>> print(list(record.letter_annotations)) 261 ['solexa_quality'] 262 >>> print(record.letter_annotations["solexa_quality"]) 263 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 264 265 The letter_annotations get sliced automatically if you slice the 266 parent SeqRecord, for example taking the last ten bases: 267 268 >>> sub_record = record[-10:] 269 >>> print("%s %s" % (sub_record.id, sub_record.seq)) 270 slxa_0001_1_0001_01 ACGTNNNNNN 271 >>> print(sub_record.letter_annotations["solexa_quality"]) 272 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 273 274 Any python sequence (i.e. list, tuple or string) can be recorded in 275 the SeqRecord's letter_annotations dictionary as long as the length 276 matches that of the SeqRecord's sequence. e.g. 277 278 >>> len(sub_record.letter_annotations) 279 1 280 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 281 >>> len(sub_record.letter_annotations) 282 2 283 284 You can delete entries from the letter_annotations dictionary as usual: 285 286 >>> del sub_record.letter_annotations["solexa_quality"] 287 >>> sub_record.letter_annotations 288 {'dummy': 'abcdefghij'} 289 290 You can completely clear the dictionary easily as follows: 291 292 >>> sub_record.letter_annotations = {} 293 >>> sub_record.letter_annotations 294 {} 295 """) 296
297 - def _set_seq(self, value):
298 # TODO - Add a deprecation warning that the seq should be write only? 299 if self._per_letter_annotations: 300 # TODO - Make this a warning? Silently empty the dictionary? 301 raise ValueError("You must empty the letter annotations first!") 302 self._seq = value 303 try: 304 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 305 except AttributeError: 306 # e.g. seq is None 307 self._per_letter_annotations = _RestrictedDict(length=0)
308 309 seq = property(fget=lambda self: self._seq, 310 fset=_set_seq, 311 doc="The sequence itself, as a Seq or MutableSeq object.") 312
313 - def __getitem__(self, index):
314 """Returns a sub-sequence or an individual letter. 315 316 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 317 that sub-sequence with appropriate annotation preserved. The 318 name, id and description are kept. 319 320 Any per-letter-annotations are sliced to match the requested 321 sub-sequence. Unless a stride is used, all those features 322 which fall fully within the subsequence are included (with 323 their locations adjusted accordingly). 324 325 However, the annotations dictionary and the dbxrefs list are 326 not used for the new SeqRecord, as in general they may not 327 apply to the subsequence. If you want to preserve them, you 328 must explicitly copy them to the new SeqRecord yourself. 329 330 Using an integer index, e.g. my_record[5] is shorthand for 331 extracting that letter from the sequence, my_record.seq[5]. 332 333 For example, consider this short protein and its secondary 334 structure as encoded by the PDB (e.g. H for alpha helices), 335 plus a simple feature for its histidine self phosphorylation 336 site: 337 338 >>> from Bio.Seq import Seq 339 >>> from Bio.SeqRecord import SeqRecord 340 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 341 >>> from Bio.Alphabet import IUPAC 342 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 343 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 344 ... IUPAC.protein), 345 ... id="1JOY", name="EnvZ", 346 ... description="Homodimeric domain of EnvZ from E. coli") 347 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 348 >>> rec.features.append(SeqFeature(FeatureLocation(20, 21), 349 ... type = "Site")) 350 351 Now let's have a quick look at the full record, 352 353 >>> print(rec) 354 ID: 1JOY 355 Name: EnvZ 356 Description: Homodimeric domain of EnvZ from E. coli 357 Number of features: 1 358 Per letter annotation for: secondary_structure 359 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 360 >>> rec.letter_annotations["secondary_structure"] 361 ' S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT ' 362 >>> print(rec.features[0].location) 363 [20:21] 364 365 Now let's take a sub sequence, here chosen as the first (fractured) 366 alpha helix which includes the histidine phosphorylation site: 367 368 >>> sub = rec[11:41] 369 >>> print(sub) 370 ID: 1JOY 371 Name: EnvZ 372 Description: Homodimeric domain of EnvZ from E. coli 373 Number of features: 1 374 Per letter annotation for: secondary_structure 375 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 376 >>> sub.letter_annotations["secondary_structure"] 377 'HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH' 378 >>> print(sub.features[0].location) 379 [9:10] 380 381 You can also of course omit the start or end values, for 382 example to get the first ten letters only: 383 384 >>> print(rec[:10]) 385 ID: 1JOY 386 Name: EnvZ 387 Description: Homodimeric domain of EnvZ from E. coli 388 Number of features: 0 389 Per letter annotation for: secondary_structure 390 Seq('MAAGVKQLAD', IUPACProtein()) 391 392 Or for the last ten letters: 393 394 >>> print(rec[-10:]) 395 ID: 1JOY 396 Name: EnvZ 397 Description: Homodimeric domain of EnvZ from E. coli 398 Number of features: 0 399 Per letter annotation for: secondary_structure 400 Seq('IIEQFIDYLR', IUPACProtein()) 401 402 If you omit both, then you get a copy of the original record (although 403 lacking the annotations and dbxrefs): 404 405 >>> print(rec[:]) 406 ID: 1JOY 407 Name: EnvZ 408 Description: Homodimeric domain of EnvZ from E. coli 409 Number of features: 1 410 Per letter annotation for: secondary_structure 411 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 412 413 Finally, indexing with a simple integer is shorthand for pulling out 414 that letter from the sequence directly: 415 416 >>> rec[5] 417 'K' 418 >>> rec.seq[5] 419 'K' 420 """ 421 if isinstance(index, int): 422 # NOTE - The sequence level annotation like the id, name, etc 423 # do not really apply to a single character. However, should 424 # we try and expose any per-letter-annotation here? If so how? 425 return self.seq[index] 426 elif isinstance(index, slice): 427 if self.seq is None: 428 raise ValueError("If the sequence is None, we cannot slice it.") 429 parent_length = len(self) 430 answer = self.__class__(self.seq[index], 431 id=self.id, 432 name=self.name, 433 description=self.description) 434 # TODO - The description may no longer apply. 435 # It would be safer to change it to something 436 # generic like "edited" or the default value. 437 438 # Don't copy the annotation dict and dbxefs list, 439 # they may not apply to a subsequence. 440 # answer.annotations = dict(self.annotations.items()) 441 # answer.dbxrefs = self.dbxrefs[:] 442 # TODO - Review this in light of adding SeqRecord objects? 443 444 # TODO - Cope with strides by generating ambiguous locations? 445 start, stop, step = index.indices(parent_length) 446 if step == 1: 447 # Select relevant features, add them with shifted locations 448 # assert str(self.seq)[index] == str(self.seq)[start:stop] 449 for f in self.features: 450 if f.ref or f.ref_db: 451 # TODO - Implement this (with lots of tests)? 452 import warnings 453 warnings.warn("When slicing SeqRecord objects, any " 454 "SeqFeature referencing other sequences (e.g. " 455 "from segmented GenBank records) are ignored.") 456 continue 457 if start <= f.location.nofuzzy_start \ 458 and f.location.nofuzzy_end <= stop: 459 answer.features.append(f._shift(-start)) 460 461 # Slice all the values to match the sliced sequence 462 # (this should also work with strides, even negative strides): 463 for key, value in self.letter_annotations.items(): 464 answer._per_letter_annotations[key] = value[index] 465 466 return answer 467 raise ValueError("Invalid index")
468
469 - def __iter__(self):
470 """Iterate over the letters in the sequence. 471 472 For example, using Bio.SeqIO to read in a protein FASTA file: 473 474 >>> from Bio import SeqIO 475 >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta") 476 >>> for amino in record: 477 ... print(amino) 478 ... if amino == "L": break 479 X 480 A 481 G 482 L 483 >>> print(record.seq[3]) 484 L 485 486 This is just a shortcut for iterating over the sequence directly: 487 488 >>> for amino in record.seq: 489 ... print(amino) 490 ... if amino == "L": break 491 X 492 A 493 G 494 L 495 >>> print(record.seq[3]) 496 L 497 498 Note that this does not facilitate iteration together with any 499 per-letter-annotation. However, you can achieve that using the 500 python zip function on the record (or its sequence) and the relevant 501 per-letter-annotation: 502 503 >>> from Bio import SeqIO 504 >>> rec = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 505 >>> print("%s %s" % (rec.id, rec.seq)) 506 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 507 >>> print(list(rec.letter_annotations)) 508 ['solexa_quality'] 509 >>> for nuc, qual in zip(rec, rec.letter_annotations["solexa_quality"]): 510 ... if qual > 35: 511 ... print("%s %i" % (nuc, qual)) 512 A 40 513 C 39 514 G 38 515 T 37 516 A 36 517 518 You may agree that using zip(rec.seq, ...) is more explicit than using 519 zip(rec, ...) as shown above. 520 """ 521 return iter(self.seq)
522
523 - def __contains__(self, char):
524 """Implements the 'in' keyword, searches the sequence. 525 526 e.g. 527 528 >>> from Bio import SeqIO 529 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 530 >>> "GAATTC" in record 531 False 532 >>> "AAA" in record 533 True 534 535 This essentially acts as a proxy for using "in" on the sequence: 536 537 >>> "GAATTC" in record.seq 538 False 539 >>> "AAA" in record.seq 540 True 541 542 Note that you can also use Seq objects as the query, 543 544 >>> from Bio.Seq import Seq 545 >>> from Bio.Alphabet import generic_dna 546 >>> Seq("AAA") in record 547 True 548 >>> Seq("AAA", generic_dna) in record 549 True 550 551 See also the Seq object's __contains__ method. 552 """ 553 return char in self.seq
554
555 - def __str__(self):
556 """A human readable summary of the record and its annotation (string). 557 558 The python built in function str works by calling the object's ___str__ 559 method. e.g. 560 561 >>> from Bio.Seq import Seq 562 >>> from Bio.SeqRecord import SeqRecord 563 >>> from Bio.Alphabet import IUPAC 564 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 565 ... IUPAC.protein), 566 ... id="YP_025292.1", name="HokC", 567 ... description="toxic membrane protein, small") 568 >>> print(str(record)) 569 ID: YP_025292.1 570 Name: HokC 571 Description: toxic membrane protein, small 572 Number of features: 0 573 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 574 575 In this example you don't actually need to call str explicity, as the 576 print command does this automatically: 577 578 >>> print(record) 579 ID: YP_025292.1 580 Name: HokC 581 Description: toxic membrane protein, small 582 Number of features: 0 583 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 584 585 Note that long sequences are shown truncated. 586 """ 587 lines = [] 588 if self.id: 589 lines.append("ID: {0}".format(self.id)) 590 if self.name: 591 lines.append("Name: {0}".format(self.name)) 592 if self.description: 593 lines.append("Description: {0}".format(self.description)) 594 if self.dbxrefs: 595 lines.append("Database cross-references: " + ", ".join(self.dbxrefs)) 596 lines.append("Number of features: {0}".format(len(self.features))) 597 for a in self.annotations: 598 lines.append("/{0}={1}".format(a, str(self.annotations[a]))) 599 if self.letter_annotations: 600 lines.append("Per letter annotation for: " + ", ".join(self.letter_annotations)) 601 # Don't want to include the entire sequence, 602 # and showing the alphabet is useful: 603 lines.append(repr(self.seq)) 604 return "\n".join(lines)
605
606 - def __repr__(self):
607 """A concise summary of the record for debugging (string). 608 609 The python built in function repr works by calling the object's ___repr__ 610 method. e.g. 611 612 >>> from Bio.Seq import Seq 613 >>> from Bio.SeqRecord import SeqRecord 614 >>> from Bio.Alphabet import generic_protein 615 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 616 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 617 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 618 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 619 ... generic_protein), 620 ... id="NP_418483.1", name="b4059", 621 ... description="ssDNA-binding protein", 622 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 623 >>> print(repr(rec)) 624 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 625 626 At the python prompt you can also use this shorthand: 627 628 >>> rec 629 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 630 631 Note that long sequences are shown truncated. Also note that any 632 annotations, letter_annotations and features are not shown (as they 633 would lead to a very long string). 634 """ 635 return "{0}(seq={1!r}, id={2!r}, name={3!r}, description={4!r}, dbxrefs={5!r})".format( 636 self.__class__.__name__, 637 self.seq, self.id, self.name, 638 self.description, self.dbxrefs)
639
640 - def format(self, format):
641 r"""Returns the record as a string in the specified file format. 642 643 The format should be a lower case string supported as an output 644 format by Bio.SeqIO, which is used to turn the SeqRecord into a 645 string. e.g. 646 647 >>> from Bio.Seq import Seq 648 >>> from Bio.SeqRecord import SeqRecord 649 >>> from Bio.Alphabet import IUPAC 650 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 651 ... IUPAC.protein), 652 ... id="YP_025292.1", name="HokC", 653 ... description="toxic membrane protein") 654 >>> record.format("fasta") 655 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 656 >>> print(record.format("fasta")) 657 >YP_025292.1 toxic membrane protein 658 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 659 <BLANKLINE> 660 661 The python print command automatically appends a new line, meaning 662 in this example a blank line is shown. If you look at the string 663 representation you can see there is a trailing new line (shown as 664 slash n) which is important when writing to a file or if 665 concatenating multiple sequence strings together. 666 667 Note that this method will NOT work on every possible file format 668 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 669 """ 670 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 671 # See also the Bio.Align.Generic.Alignment class and its format() 672 return self.__format__(format)
673
674 - def __format__(self, format_spec):
675 """Returns the record as a string in the specified file format. 676 677 This method supports the python format() function added in 678 Python 2.6/3.0. The format_spec should be a lower case string 679 supported by Bio.SeqIO as an output file format. See also the 680 SeqRecord's format() method. 681 682 Under Python 3 please note that for binary formats a bytes 683 string is returned, otherwise a (unicode) string is returned. 684 """ 685 if not format_spec: 686 # Follow python convention and default to using __str__ 687 return str(self) 688 from Bio import SeqIO 689 if format_spec in SeqIO._BinaryFormats: 690 # Return bytes on Python 3 691 from io import BytesIO 692 handle = BytesIO() 693 else: 694 from Bio._py3k import StringIO 695 handle = StringIO() 696 SeqIO.write(self, handle, format_spec) 697 return handle.getvalue()
698
699 - def __len__(self):
700 """Returns the length of the sequence. 701 702 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 703 704 >>> from Bio import SeqIO 705 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta") 706 >>> len(record) 707 309 708 >>> len(record.seq) 709 309 710 """ 711 return len(self.seq)
712
713 - def __lt__(self, other):
714 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
715
716 - def __le___(self, other):
717 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
718
719 - def __eq__(self, other):
720 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
721
722 - def __ne__(self, other):
723 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
724
725 - def __gt__(self, other):
726 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
727
728 - def __ge__(self, other):
729 raise NotImplementedError(_NO_SEQRECORD_COMPARISON)
730 731 # Note Python 3 does not use __cmp__ and there is no need to 732 # define __cmp__ on Python 2 as have all of _lt__ etc defined. 733 734 # Python 3:
735 - def __bool__(self):
736 """Boolean value of an instance of this class (True). 737 738 This behaviour is for backwards compatibility, since until the 739 __len__ method was added, a SeqRecord always evaluated as True. 740 741 Note that in comparison, a Seq object will evaluate to False if it 742 has a zero length sequence. 743 744 WARNING: The SeqRecord may in future evaluate to False when its 745 sequence is of zero length (in order to better match the Seq 746 object behaviour)! 747 """ 748 return True
749 750 # Python 2: 751 __nonzero__ = __bool__ 752
753 - def __add__(self, other):
754 """Add another sequence or string to this sequence. 755 756 The other sequence can be a SeqRecord object, a Seq object (or 757 similar, e.g. a MutableSeq) or a plain Python string. If you add 758 a plain string or a Seq (like) object, the new SeqRecord will simply 759 have this appended to the existing data. However, any per letter 760 annotation will be lost: 761 762 >>> from Bio import SeqIO 763 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 764 >>> print("%s %s" % (record.id, record.seq)) 765 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 766 >>> print(list(record.letter_annotations)) 767 ['solexa_quality'] 768 769 >>> new = record + "ACT" 770 >>> print("%s %s" % (new.id, new.seq)) 771 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 772 >>> print(list(new.letter_annotations)) 773 [] 774 775 The new record will attempt to combine the annotation, but for any 776 ambiguities (e.g. different names) it defaults to omitting that 777 annotation. 778 779 >>> from Bio import SeqIO 780 >>> with open("GenBank/pBAD30.gb") as handle: 781 ... plasmid = SeqIO.read(handle, "gb") 782 >>> print("%s %i" % (plasmid.id, len(plasmid))) 783 pBAD30 4923 784 785 Now let's cut the plasmid into two pieces, and join them back up the 786 other way round (i.e. shift the starting point on this plasmid, have 787 a look at the annotated features in the original file to see why this 788 particular split point might make sense): 789 790 >>> left = plasmid[:3765] 791 >>> right = plasmid[3765:] 792 >>> new = right + left 793 >>> print("%s %i" % (new.id, len(new))) 794 pBAD30 4923 795 >>> str(new.seq) == str(right.seq + left.seq) 796 True 797 >>> len(new.features) == len(left.features) + len(right.features) 798 True 799 800 When we add the left and right SeqRecord objects, their annotation 801 is all consistent, so it is all conserved in the new SeqRecord: 802 803 >>> new.id == left.id == right.id == plasmid.id 804 True 805 >>> new.name == left.name == right.name == plasmid.name 806 True 807 >>> new.description == plasmid.description 808 True 809 >>> new.annotations == left.annotations == right.annotations 810 True 811 >>> new.letter_annotations == plasmid.letter_annotations 812 True 813 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 814 True 815 816 However, we should point out that when we sliced the SeqRecord, 817 any annotations dictionary or dbxrefs list entries were lost. 818 You can explicitly copy them like this: 819 820 >>> new.annotations = plasmid.annotations.copy() 821 >>> new.dbxrefs = plasmid.dbxrefs[:] 822 """ 823 if not isinstance(other, SeqRecord): 824 # Assume it is a string or a Seq. 825 # Note can't transfer any per-letter-annotations 826 return SeqRecord(self.seq + other, 827 id=self.id, name=self.name, 828 description=self.description, 829 features=self.features[:], 830 annotations=self.annotations.copy(), 831 dbxrefs=self.dbxrefs[:]) 832 # Adding two SeqRecord objects... must merge annotation. 833 answer = SeqRecord(self.seq + other.seq, 834 features=self.features[:], 835 dbxrefs=self.dbxrefs[:]) 836 # Will take all the features and all the db cross refs, 837 l = len(self) 838 for f in other.features: 839 answer.features.append(f._shift(l)) 840 del l 841 for ref in other.dbxrefs: 842 if ref not in answer.dbxrefs: 843 answer.dbxrefs.append(ref) 844 # Take common id/name/description/annotation 845 if self.id == other.id: 846 answer.id = self.id 847 if self.name == other.name: 848 answer.name = self.name 849 if self.description == other.description: 850 answer.description = self.description 851 for k, v in self.annotations.items(): 852 if k in other.annotations and other.annotations[k] == v: 853 answer.annotations[k] = v 854 # Can append matching per-letter-annotation 855 for k, v in self.letter_annotations.items(): 856 if k in other.letter_annotations: 857 answer.letter_annotations[k] = v + other.letter_annotations[k] 858 return answer
859
860 - def __radd__(self, other):
861 """Add another sequence or string to this sequence (from the left). 862 863 This method handles adding a Seq object (or similar, e.g. MutableSeq) 864 or a plain Python string (on the left) to a SeqRecord (on the right). 865 See the __add__ method for more details, but for example: 866 867 >>> from Bio import SeqIO 868 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 869 >>> print("%s %s" % (record.id, record.seq)) 870 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 871 >>> print(list(record.letter_annotations)) 872 ['solexa_quality'] 873 874 >>> new = "ACT" + record 875 >>> print("%s %s" % (new.id, new.seq)) 876 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 877 >>> print(list(new.letter_annotations)) 878 [] 879 """ 880 if isinstance(other, SeqRecord): 881 raise RuntimeError("This should have happened via the __add__ of " 882 "the other SeqRecord being added!") 883 # Assume it is a string or a Seq. 884 # Note can't transfer any per-letter-annotations 885 offset = len(other) 886 return SeqRecord(other + self.seq, 887 id=self.id, name=self.name, 888 description=self.description, 889 features=[f._shift(offset) for f in self.features], 890 annotations=self.annotations.copy(), 891 dbxrefs=self.dbxrefs[:])
892
893 - def upper(self):
894 """Returns a copy of the record with an upper case sequence. 895 896 All the annotation is preserved unchanged. e.g. 897 898 >>> from Bio.Alphabet import generic_dna 899 >>> from Bio.Seq import Seq 900 >>> from Bio.SeqRecord import SeqRecord 901 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test", 902 ... description = "Made up for this example") 903 >>> record.letter_annotations["phred_quality"] = [1, 2, 3, 4, 5, 6, 7, 8] 904 >>> print(record.upper().format("fastq")) 905 @Test Made up for this example 906 ACGTACGT 907 + 908 "#$%&'() 909 <BLANKLINE> 910 911 Naturally, there is a matching lower method: 912 913 >>> print(record.lower().format("fastq")) 914 @Test Made up for this example 915 acgtacgt 916 + 917 "#$%&'() 918 <BLANKLINE> 919 """ 920 return SeqRecord(self.seq.upper(), 921 id=self.id, name=self.name, 922 description=self.description, 923 dbxrefs=self.dbxrefs[:], 924 features=self.features[:], 925 annotations=self.annotations.copy(), 926 letter_annotations=self.letter_annotations.copy())
927
928 - def lower(self):
929 """Returns a copy of the record with a lower case sequence. 930 931 All the annotation is preserved unchanged. e.g. 932 933 >>> from Bio import SeqIO 934 >>> record = SeqIO.read("Fasta/aster.pro", "fasta") 935 >>> print(record.format("fasta")) 936 >gi|3298468|dbj|BAA31520.1| SAMIPF 937 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG 938 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI 939 <BLANKLINE> 940 >>> print(record.lower().format("fasta")) 941 >gi|3298468|dbj|BAA31520.1| SAMIPF 942 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg 943 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani 944 <BLANKLINE> 945 946 To take a more annotation rich example, 947 948 >>> from Bio import SeqIO 949 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl") 950 >>> len(old.features) 951 3 952 >>> new = old.lower() 953 >>> len(old.features) == len(new.features) 954 True 955 >>> old.annotations["organism"] == new.annotations["organism"] 956 True 957 >>> old.dbxrefs == new.dbxrefs 958 True 959 """ 960 return SeqRecord(self.seq.lower(), 961 id=self.id, name=self.name, 962 description=self.description, 963 dbxrefs=self.dbxrefs[:], 964 features=self.features[:], 965 annotations=self.annotations.copy(), 966 letter_annotations=self.letter_annotations.copy())
967
968 - def reverse_complement(self, id=False, name=False, description=False, 969 features=True, annotations=False, 970 letter_annotations=True, dbxrefs=False):
971 """Returns new SeqRecord with reverse complement sequence. 972 973 You can specify the returned record's id, name and description as 974 strings, or True to keep that of the parent, or False for a default. 975 976 You can specify the returned record's features with a list of 977 SeqFeature objects, or True to keep that of the parent, or False to 978 omit them. The default is to keep the original features (with the 979 strand and locations adjusted). 980 981 You can also specify both the returned record's annotations and 982 letter_annotations as dictionaries, True to keep that of the parent, 983 or False to omit them. The default is to keep the original 984 annotations (with the letter annotations reversed). 985 986 To show what happens to the pre-letter annotations, consider an 987 example Solexa variant FASTQ file with a single entry, which we'll 988 read in as a SeqRecord: 989 990 >>> from Bio import SeqIO 991 >>> record = SeqIO.read("Quality/solexa_faked.fastq", "fastq-solexa") 992 >>> print("%s %s" % (record.id, record.seq)) 993 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 994 >>> print(list(record.letter_annotations)) 995 ['solexa_quality'] 996 >>> print(record.letter_annotations["solexa_quality"]) 997 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 998 999 Now take the reverse complement, 1000 1001 >>> rc_record = record.reverse_complement(id=record.id+"_rc") 1002 >>> print("%s %s" % (rc_record.id, rc_record.seq)) 1003 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT 1004 1005 Notice that the per-letter-annotations have also been reversed, 1006 although this may not be appropriate for all cases. 1007 1008 >>> print(rc_record.letter_annotations["solexa_quality"]) 1009 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40] 1010 1011 Now for the features, we need a different example. Parsing a GenBank 1012 file is probably the easiest way to get an nice example with features 1013 in it... 1014 1015 >>> from Bio import SeqIO 1016 >>> with open("GenBank/pBAD30.gb") as handle: 1017 ... plasmid = SeqIO.read(handle, "gb") 1018 >>> print("%s %i" % (plasmid.id, len(plasmid))) 1019 pBAD30 4923 1020 >>> plasmid.seq 1021 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA()) 1022 >>> len(plasmid.features) 1023 13 1024 1025 Now, let's take the reverse complement of this whole plasmid: 1026 1027 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc") 1028 >>> print("%s %i" % (rc_plasmid.id, len(rc_plasmid))) 1029 pBAD30_rc 4923 1030 >>> rc_plasmid.seq 1031 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA()) 1032 >>> len(rc_plasmid.features) 1033 13 1034 1035 Let's compare the first CDS feature - it has gone from being the 1036 second feature (index 1) to the second last feature (index -2), its 1037 strand has changed, and the location switched round. 1038 1039 >>> print(plasmid.features[1]) 1040 type: CDS 1041 location: [1081:1960](-) 1042 qualifiers: 1043 Key: label, Value: ['araC'] 1044 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1045 Key: vntifkey, Value: ['4'] 1046 <BLANKLINE> 1047 >>> print(rc_plasmid.features[-2]) 1048 type: CDS 1049 location: [2963:3842](+) 1050 qualifiers: 1051 Key: label, Value: ['araC'] 1052 Key: note, Value: ['araC regulator of the arabinose BAD promoter'] 1053 Key: vntifkey, Value: ['4'] 1054 <BLANKLINE> 1055 1056 You can check this new location, based on the length of the plasmid: 1057 1058 >>> len(plasmid) - 1081 1059 3842 1060 >>> len(plasmid) - 1960 1061 2963 1062 1063 Note that if the SeqFeature annotation includes any strand specific 1064 information (e.g. base changes for a SNP), this information is not 1065 amended, and would need correction after the reverse complement. 1066 1067 Note trying to reverse complement a protein SeqRecord raises an 1068 exception: 1069 1070 >>> from Bio.SeqRecord import SeqRecord 1071 >>> from Bio.Seq import Seq 1072 >>> from Bio.Alphabet import IUPAC 1073 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test") 1074 >>> protein_rec.reverse_complement() 1075 Traceback (most recent call last): 1076 ... 1077 ValueError: Proteins do not have complements! 1078 1079 Also note you can reverse complement a SeqRecord using a MutableSeq: 1080 1081 >>> from Bio.SeqRecord import SeqRecord 1082 >>> from Bio.Seq import MutableSeq 1083 >>> from Bio.Alphabet import generic_dna 1084 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test") 1085 >>> rec.seq[0] = "T" 1086 >>> print("%s %s" % (rec.id, rec.seq)) 1087 Test TCGT 1088 >>> rc = rec.reverse_complement(id=True) 1089 >>> print("%s %s" % (rc.id, rc.seq)) 1090 Test ACGA 1091 """ 1092 from Bio.Seq import MutableSeq # Lazy to avoid circular imports 1093 if isinstance(self.seq, MutableSeq): 1094 # Currently the MutableSeq reverse complement is in situ 1095 answer = SeqRecord(self.seq.toseq().reverse_complement()) 1096 else: 1097 answer = SeqRecord(self.seq.reverse_complement()) 1098 if isinstance(id, basestring): 1099 answer.id = id 1100 elif id: 1101 answer.id = self.id 1102 if isinstance(name, basestring): 1103 answer.name = name 1104 elif name: 1105 answer.name = self.name 1106 if isinstance(description, basestring): 1107 answer.description = description 1108 elif description: 1109 answer.description = self.description 1110 if isinstance(dbxrefs, list): 1111 answer.dbxrefs = dbxrefs 1112 elif dbxrefs: 1113 # Copy the old dbxrefs 1114 answer.dbxrefs = self.dbxrefs[:] 1115 if isinstance(features, list): 1116 answer.features = features 1117 elif features: 1118 # Copy the old features, adjusting location and string 1119 l = len(answer) 1120 answer.features = [f._flip(l) for f in self.features] 1121 # The old list should have been sorted by start location, 1122 # reversing it will leave it sorted by what is now the end position, 1123 # so we need to resort in case of overlapping features. 1124 # NOTE - In the common case of gene before CDS (and similar) with 1125 # the exact same locations, this will still maintain gene before CDS 1126 answer.features.sort(key=lambda x: x.location.start.position) 1127 if isinstance(annotations, dict): 1128 answer.annotations = annotations 1129 elif annotations: 1130 # Copy the old annotations, 1131 answer.annotations = self.annotations.copy() 1132 if isinstance(letter_annotations, dict): 1133 answer.letter_annotations = letter_annotations 1134 elif letter_annotations: 1135 # Copy the old per letter annotations, reversing them 1136 for key, value in self.letter_annotations.items(): 1137 answer._per_letter_annotations[key] = value[::-1] 1138 return answer
1139 1140 1141 if __name__ == "__main__": 1142 from Bio._utils import run_doctest 1143 run_doctest() 1144