1
2
3
4
5
6
7
8 """Represent a Sequence Record, a sequence with annotation."""
9 __docformat__ = "epytext en"
10
11
12
13
14
15
16
18 """Dict which only allows sequences of given length as values (PRIVATE).
19
20 This simple subclass of the Python dictionary is used in the SeqRecord
21 object for holding per-letter-annotations. This class is intended to
22 prevent simple errors by only allowing python sequences (e.g. lists,
23 strings and tuples) to be stored, and only if their length matches that
24 expected (the length of the SeqRecord's seq object). It cannot however
25 prevent the entries being edited in situ (for example appending entries
26 to a list).
27
28 >>> x = _RestrictedDict(5)
29 >>> x["test"] = "hello"
30 >>> x
31 {'test': 'hello'}
32
33 Adding entries which don't have the expected length are blocked:
34
35 >>> x["test"] = "hello world"
36 Traceback (most recent call last):
37 ...
38 TypeError: We only allow python sequences (lists, tuples or strings) of length 5.
39
40 The expected length is stored as a private attribute,
41
42 >>> x._length
43 5
44
45 In order that the SeqRecord (and other objects using this class) can be
46 pickled, for example for use in the multiprocessing library, we need to
47 be able to pickle the restricted dictionary objects.
48
49 Using the default protocol, which is 0 on Python 2.x,
50
51 >>> import pickle
52 >>> y = pickle.loads(pickle.dumps(x))
53 >>> y
54 {'test': 'hello'}
55 >>> y._length
56 5
57
58 Using the highest protocol, which is 2 on Python 2.x,
59
60 >>> import pickle
61 >>> z = pickle.loads(pickle.dumps(x, pickle.HIGHEST_PROTOCOL))
62 >>> z
63 {'test': 'hello'}
64 >>> z._length
65 5
66 """
67
72
74
75
76 if not hasattr(value, "__len__") or not hasattr(value, "__getitem__") \
77 or (hasattr(self, "_length") and len(value) != self._length):
78 raise TypeError("We only allow python sequences (lists, tuples or "
79 "strings) of length %i." % self._length)
80 dict.__setitem__(self, key, value)
81
86
87
89 """A SeqRecord object holds a sequence and information about it.
90
91 Main attributes:
92 - id - Identifier such as a locus tag (string)
93 - seq - The sequence itself (Seq object or similar)
94
95 Additional attributes:
96 - name - Sequence name, e.g. gene name (string)
97 - description - Additional text (string)
98 - dbxrefs - List of database cross references (list of strings)
99 - features - Any (sub)features defined (list of SeqFeature objects)
100 - annotations - Further information about the whole sequence (dictionary).
101 Most entries are strings, or lists of strings.
102 - letter_annotations - Per letter/symbol annotation (restricted
103 dictionary). This holds Python sequences (lists, strings
104 or tuples) whose length matches that of the sequence.
105 A typical use would be to hold a list of integers
106 representing sequencing quality scores, or a string
107 representing the secondary structure.
108
109 You will typically use Bio.SeqIO to read in sequences from files as
110 SeqRecord objects. However, you may want to create your own SeqRecord
111 objects directly (see the __init__ method for further details):
112
113 >>> from Bio.Seq import Seq
114 >>> from Bio.SeqRecord import SeqRecord
115 >>> from Bio.Alphabet import IUPAC
116 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
117 ... IUPAC.protein),
118 ... id="YP_025292.1", name="HokC",
119 ... description="toxic membrane protein")
120 >>> print record
121 ID: YP_025292.1
122 Name: HokC
123 Description: toxic membrane protein
124 Number of features: 0
125 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
126
127 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO
128 for this. For the special case where you want the SeqRecord turned into
129 a string in a particular file format there is a format method which uses
130 Bio.SeqIO internally:
131
132 >>> print record.format("fasta")
133 >YP_025292.1 toxic membrane protein
134 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
135 <BLANKLINE>
136
137 You can also do things like slicing a SeqRecord, checking its length, etc
138
139 >>> len(record)
140 44
141 >>> edited = record[:10] + record[11:]
142 >>> print edited.seq
143 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
144 >>> print record.seq
145 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
146
147 """
148 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>",
149 description = "<unknown description>", dbxrefs = None,
150 features = None, annotations = None,
151 letter_annotations = None):
152 """Create a SeqRecord.
153
154 Arguments:
155 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq)
156 - id - Sequence identifier, recommended (string)
157 - name - Sequence name, optional (string)
158 - description - Sequence description, optional (string)
159 - dbxrefs - Database cross references, optional (list of strings)
160 - features - Any (sub)features, optional (list of SeqFeature objects)
161 - annotations - Dictionary of annotations for the whole sequence
162 - letter_annotations - Dictionary of per-letter-annotations, values
163 should be strings, list or tuples of the same
164 length as the full sequence.
165
166 You will typically use Bio.SeqIO to read in sequences from files as
167 SeqRecord objects. However, you may want to create your own SeqRecord
168 objects directly.
169
170 Note that while an id is optional, we strongly recommend you supply a
171 unique id string for each record. This is especially important
172 if you wish to write your sequences to a file.
173
174 If you don't have the actual sequence, but you do know its length,
175 then using the UnknownSeq object from Bio.Seq is appropriate.
176
177 You can create a 'blank' SeqRecord object, and then populate the
178 attributes later.
179 """
180 if id is not None and not isinstance(id, basestring):
181
182 raise TypeError("id argument should be a string")
183 if not isinstance(name, basestring):
184 raise TypeError("name argument should be a string")
185 if not isinstance(description, basestring):
186 raise TypeError("description argument should be a string")
187 self._seq = seq
188 self.id = id
189 self.name = name
190 self.description = description
191
192
193 if dbxrefs is None:
194 dbxrefs = []
195 elif not isinstance(dbxrefs, list):
196 raise TypeError("dbxrefs argument should be a list (of strings)")
197 self.dbxrefs = dbxrefs
198
199
200 if annotations is None:
201 annotations = {}
202 elif not isinstance(annotations, dict):
203 raise TypeError("annotations argument should be a dict")
204 self.annotations = annotations
205
206 if letter_annotations is None:
207
208 if seq is None:
209
210 self._per_letter_annotations = _RestrictedDict(length=0)
211 else:
212 try:
213 self._per_letter_annotations = \
214 _RestrictedDict(length=len(seq))
215 except:
216 raise TypeError("seq argument should be a Seq object or similar")
217 else:
218
219
220
221 self.letter_annotations = letter_annotations
222
223
224 if features is None:
225 features = []
226 elif not isinstance(features, list):
227 raise TypeError("features argument should be a list (of SeqFeature objects)")
228 self.features = features
229
230
232 if not isinstance(value, dict):
233 raise TypeError("The per-letter-annotations should be a "
234 "(restricted) dictionary.")
235
236 try:
237 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
238 except AttributeError:
239
240 self._per_letter_annotations = _RestrictedDict(length=0)
241 self._per_letter_annotations.update(value)
242 letter_annotations = property(
243 fget=lambda self: self._per_letter_annotations,
244 fset=_set_per_letter_annotations,
245 doc="""Dictionary of per-letter-annotation for the sequence.
246
247 For example, this can hold quality scores used in FASTQ or QUAL files.
248 Consider this example using Bio.SeqIO to read in an example Solexa
249 variant FASTQ file as a SeqRecord:
250
251 >>> from Bio import SeqIO
252 >>> handle = open("Quality/solexa_faked.fastq", "rU")
253 >>> record = SeqIO.read(handle, "fastq-solexa")
254 >>> handle.close()
255 >>> print record.id, record.seq
256 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
257 >>> print record.letter_annotations.keys()
258 ['solexa_quality']
259 >>> print record.letter_annotations["solexa_quality"]
260 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
261
262 The letter_annotations get sliced automatically if you slice the
263 parent SeqRecord, for example taking the last ten bases:
264
265 >>> sub_record = record[-10:]
266 >>> print sub_record.id, sub_record.seq
267 slxa_0001_1_0001_01 ACGTNNNNNN
268 >>> print sub_record.letter_annotations["solexa_quality"]
269 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
270
271 Any python sequence (i.e. list, tuple or string) can be recorded in
272 the SeqRecord's letter_annotations dictionary as long as the length
273 matches that of the SeqRecord's sequence. e.g.
274
275 >>> len(sub_record.letter_annotations)
276 1
277 >>> sub_record.letter_annotations["dummy"] = "abcdefghij"
278 >>> len(sub_record.letter_annotations)
279 2
280
281 You can delete entries from the letter_annotations dictionary as usual:
282
283 >>> del sub_record.letter_annotations["solexa_quality"]
284 >>> sub_record.letter_annotations
285 {'dummy': 'abcdefghij'}
286
287 You can completely clear the dictionary easily as follows:
288
289 >>> sub_record.letter_annotations = {}
290 >>> sub_record.letter_annotations
291 {}
292 """)
293
295
296 if self._per_letter_annotations:
297
298 raise ValueError("You must empty the letter annotations first!")
299 self._seq = value
300 try:
301 self._per_letter_annotations = _RestrictedDict(length=len(self.seq))
302 except AttributeError:
303
304 self._per_letter_annotations = _RestrictedDict(length=0)
305
306 seq = property(fget=lambda self: self._seq,
307 fset=_set_seq,
308 doc="The sequence itself, as a Seq or MutableSeq object.")
309
311 """Returns a sub-sequence or an individual letter.
312
313 Slicing, e.g. my_record[5:10], returns a new SeqRecord for
314 that sub-sequence with approriate annotation preserved. The
315 name, id and description are kept.
316
317 Any per-letter-annotations are sliced to match the requested
318 sub-sequence. Unless a stride is used, all those features
319 which fall fully within the subsequence are included (with
320 their locations adjusted accordingly).
321
322 However, the annotations dictionary and the dbxrefs list are
323 not used for the new SeqRecord, as in general they may not
324 apply to the subsequence. If you want to preserve them, you
325 must explictly copy them to the new SeqRecord yourself.
326
327 Using an integer index, e.g. my_record[5] is shorthand for
328 extracting that letter from the sequence, my_record.seq[5].
329
330 For example, consider this short protein and its secondary
331 structure as encoded by the PDB (e.g. H for alpha helices),
332 plus a simple feature for its histidine self phosphorylation
333 site:
334
335 >>> from Bio.Seq import Seq
336 >>> from Bio.SeqRecord import SeqRecord
337 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation
338 >>> from Bio.Alphabet import IUPAC
339 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT"
340 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR",
341 ... IUPAC.protein),
342 ... id="1JOY", name="EnvZ",
343 ... description="Homodimeric domain of EnvZ from E. coli")
344 >>> rec.letter_annotations["secondary_structure"] = " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT "
345 >>> rec.features.append(SeqFeature(FeatureLocation(20,21),
346 ... type = "Site"))
347
348 Now let's have a quick look at the full record,
349
350 >>> print rec
351 ID: 1JOY
352 Name: EnvZ
353 Description: Homodimeric domain of EnvZ from E. coli
354 Number of features: 1
355 Per letter annotation for: secondary_structure
356 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
357 >>> print rec.letter_annotations["secondary_structure"]
358 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT
359 >>> print rec.features[0].location
360 [20:21]
361
362 Now let's take a sub sequence, here chosen as the first (fractured)
363 alpha helix which includes the histidine phosphorylation site:
364
365 >>> sub = rec[11:41]
366 >>> print sub
367 ID: 1JOY
368 Name: EnvZ
369 Description: Homodimeric domain of EnvZ from E. coli
370 Number of features: 1
371 Per letter annotation for: secondary_structure
372 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein())
373 >>> print sub.letter_annotations["secondary_structure"]
374 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH
375 >>> print sub.features[0].location
376 [9:10]
377
378 You can also of course omit the start or end values, for
379 example to get the first ten letters only:
380
381 >>> print rec[:10]
382 ID: 1JOY
383 Name: EnvZ
384 Description: Homodimeric domain of EnvZ from E. coli
385 Number of features: 0
386 Per letter annotation for: secondary_structure
387 Seq('MAAGVKQLAD', IUPACProtein())
388
389 Or for the last ten letters:
390
391 >>> print rec[-10:]
392 ID: 1JOY
393 Name: EnvZ
394 Description: Homodimeric domain of EnvZ from E. coli
395 Number of features: 0
396 Per letter annotation for: secondary_structure
397 Seq('IIEQFIDYLR', IUPACProtein())
398
399 If you omit both, then you get a copy of the original record (although
400 lacking the annotations and dbxrefs):
401
402 >>> print rec[:]
403 ID: 1JOY
404 Name: EnvZ
405 Description: Homodimeric domain of EnvZ from E. coli
406 Number of features: 1
407 Per letter annotation for: secondary_structure
408 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein())
409
410 Finally, indexing with a simple integer is shorthand for pulling out
411 that letter from the sequence directly:
412
413 >>> rec[5]
414 'K'
415 >>> rec.seq[5]
416 'K'
417 """
418 if isinstance(index, int):
419
420
421
422 return self.seq[index]
423 elif isinstance(index, slice):
424 if self.seq is None:
425 raise ValueError("If the sequence is None, we cannot slice it.")
426 parent_length = len(self)
427 answer = self.__class__(self.seq[index],
428 id=self.id,
429 name=self.name,
430 description=self.description)
431
432
433
434
435
436
437
438
439
440
441
442 start, stop, step = index.indices(parent_length)
443 if step == 1:
444
445
446 for f in self.features:
447 if f.ref or f.ref_db:
448
449 import warnings
450 warnings.warn("When slicing SeqRecord objects, any "
451 "SeqFeature referencing other sequences (e.g. "
452 "from segmented GenBank records) are ignored.")
453 continue
454 if start <= f.location.nofuzzy_start \
455 and f.location.nofuzzy_end <= stop:
456 answer.features.append(f._shift(-start))
457
458
459
460 for key, value in self.letter_annotations.iteritems():
461 answer._per_letter_annotations[key] = value[index]
462
463 return answer
464 raise ValueError("Invalid index")
465
467 """Iterate over the letters in the sequence.
468
469 For example, using Bio.SeqIO to read in a protein FASTA file:
470
471 >>> from Bio import SeqIO
472 >>> record = SeqIO.read("Fasta/loveliesbleeding.pro", "fasta")
473 >>> for amino in record:
474 ... print amino
475 ... if amino == "L": break
476 X
477 A
478 G
479 L
480 >>> print record.seq[3]
481 L
482
483 This is just a shortcut for iterating over the sequence directly:
484
485 >>> for amino in record.seq:
486 ... print amino
487 ... if amino == "L": break
488 X
489 A
490 G
491 L
492 >>> print record.seq[3]
493 L
494
495 Note that this does not facilitate iteration together with any
496 per-letter-annotation. However, you can achieve that using the
497 python zip function on the record (or its sequence) and the relevant
498 per-letter-annotation:
499
500 >>> from Bio import SeqIO
501 >>> rec = SeqIO.read(open("Quality/solexa_faked.fastq", "rU"),
502 ... "fastq-solexa")
503 >>> print rec.id, rec.seq
504 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
505 >>> print rec.letter_annotations.keys()
506 ['solexa_quality']
507 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]):
508 ... if qual > 35:
509 ... print nuc, qual
510 A 40
511 C 39
512 G 38
513 T 37
514 A 36
515
516 You may agree that using zip(rec.seq, ...) is more explicit than using
517 zip(rec, ...) as shown above.
518 """
519 return iter(self.seq)
520
522 """Implements the 'in' keyword, searches the sequence.
523
524 e.g.
525
526 >>> from Bio import SeqIO
527 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
528 >>> "GAATTC" in record
529 False
530 >>> "AAA" in record
531 True
532
533 This essentially acts as a proxy for using "in" on the sequence:
534
535 >>> "GAATTC" in record.seq
536 False
537 >>> "AAA" in record.seq
538 True
539
540 Note that you can also use Seq objects as the query,
541
542 >>> from Bio.Seq import Seq
543 >>> from Bio.Alphabet import generic_dna
544 >>> Seq("AAA") in record
545 True
546 >>> Seq("AAA", generic_dna) in record
547 True
548
549 See also the Seq object's __contains__ method.
550 """
551 return char in self.seq
552
554 """A human readable summary of the record and its annotation (string).
555
556 The python built in function str works by calling the object's ___str__
557 method. e.g.
558
559 >>> from Bio.Seq import Seq
560 >>> from Bio.SeqRecord import SeqRecord
561 >>> from Bio.Alphabet import IUPAC
562 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
563 ... IUPAC.protein),
564 ... id="YP_025292.1", name="HokC",
565 ... description="toxic membrane protein, small")
566 >>> print str(record)
567 ID: YP_025292.1
568 Name: HokC
569 Description: toxic membrane protein, small
570 Number of features: 0
571 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
572
573 In this example you don't actually need to call str explicity, as the
574 print command does this automatically:
575
576 >>> print record
577 ID: YP_025292.1
578 Name: HokC
579 Description: toxic membrane protein, small
580 Number of features: 0
581 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
582
583 Note that long sequences are shown truncated.
584 """
585 lines = []
586 if self.id:
587 lines.append("ID: %s" % self.id)
588 if self.name:
589 lines.append("Name: %s" % self.name)
590 if self.description:
591 lines.append("Description: %s" % self.description)
592 if self.dbxrefs:
593 lines.append("Database cross-references: "
594 + ", ".join(self.dbxrefs))
595 lines.append("Number of features: %i" % len(self.features))
596 for a in self.annotations:
597 lines.append("/%s=%s" % (a, str(self.annotations[a])))
598 if self.letter_annotations:
599 lines.append("Per letter annotation for: "
600 + ", ".join(self.letter_annotations.keys()))
601
602
603 lines.append(repr(self.seq))
604 return "\n".join(lines)
605
607 """A concise summary of the record for debugging (string).
608
609 The python built in function repr works by calling the object's ___repr__
610 method. e.g.
611
612 >>> from Bio.Seq import Seq
613 >>> from Bio.SeqRecord import SeqRecord
614 >>> from Bio.Alphabet import generic_protein
615 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT"
616 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ"
617 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ"
618 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF",
619 ... generic_protein),
620 ... id="NP_418483.1", name="b4059",
621 ... description="ssDNA-binding protein",
622 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"])
623 >>> print repr(rec)
624 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
625
626 At the python prompt you can also use this shorthand:
627
628 >>> rec
629 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570'])
630
631 Note that long sequences are shown truncated. Also note that any
632 annotations, letter_annotations and features are not shown (as they
633 would lead to a very long string).
634 """
635 return self.__class__.__name__ \
636 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \
637 % tuple(map(repr, (self.seq, self.id, self.name,
638 self.description, self.dbxrefs)))
639
673
701
703 """Returns the length of the sequence.
704
705 For example, using Bio.SeqIO to read in a FASTA nucleotide file:
706
707 >>> from Bio import SeqIO
708 >>> record = SeqIO.read("Fasta/sweetpea.nu", "fasta")
709 >>> len(record)
710 309
711 >>> len(record.seq)
712 309
713 """
714 return len(self.seq)
715
717 """Returns True regardless of the length of the sequence.
718
719 This behaviour is for backwards compatibility, since until the
720 __len__ method was added, a SeqRecord always evaluated as True.
721
722 Note that in comparison, a Seq object will evaluate to False if it
723 has a zero length sequence.
724
725 WARNING: The SeqRecord may in future evaluate to False when its
726 sequence is of zero length (in order to better match the Seq
727 object behaviour)!
728 """
729 return True
730
732 """Add another sequence or string to this sequence.
733
734 The other sequence can be a SeqRecord object, a Seq object (or
735 similar, e.g. a MutableSeq) or a plain Python string. If you add
736 a plain string or a Seq (like) object, the new SeqRecord will simply
737 have this appended to the existing data. However, any per letter
738 annotation will be lost:
739
740 >>> from Bio import SeqIO
741 >>> handle = open("Quality/solexa_faked.fastq", "rU")
742 >>> record = SeqIO.read(handle, "fastq-solexa")
743 >>> handle.close()
744 >>> print record.id, record.seq
745 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
746 >>> print record.letter_annotations.keys()
747 ['solexa_quality']
748
749 >>> new = record + "ACT"
750 >>> print new.id, new.seq
751 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT
752 >>> print new.letter_annotations.keys()
753 []
754
755 The new record will attempt to combine the annotation, but for any
756 ambiguities (e.g. different names) it defaults to omitting that
757 annotation.
758
759 >>> from Bio import SeqIO
760 >>> handle = open("GenBank/pBAD30.gb")
761 >>> plasmid = SeqIO.read(handle, "gb")
762 >>> handle.close()
763 >>> print plasmid.id, len(plasmid)
764 pBAD30 4923
765
766 Now let's cut the plasmid into two pieces, and join them back up the
767 other way round (i.e. shift the starting point on this plasmid, have
768 a look at the annotated features in the original file to see why this
769 particular split point might make sense):
770
771 >>> left = plasmid[:3765]
772 >>> right = plasmid[3765:]
773 >>> new = right + left
774 >>> print new.id, len(new)
775 pBAD30 4923
776 >>> str(new.seq) == str(right.seq + left.seq)
777 True
778 >>> len(new.features) == len(left.features) + len(right.features)
779 True
780
781 When we add the left and right SeqRecord objects, their annotation
782 is all consistent, so it is all conserved in the new SeqRecord:
783
784 >>> new.id == left.id == right.id == plasmid.id
785 True
786 >>> new.name == left.name == right.name == plasmid.name
787 True
788 >>> new.description == plasmid.description
789 True
790 >>> new.annotations == left.annotations == right.annotations
791 True
792 >>> new.letter_annotations == plasmid.letter_annotations
793 True
794 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs
795 True
796
797 However, we should point out that when we sliced the SeqRecord,
798 any annotations dictionary or dbxrefs list entries were lost.
799 You can explicitly copy them like this:
800
801 >>> new.annotations = plasmid.annotations.copy()
802 >>> new.dbxrefs = plasmid.dbxrefs[:]
803 """
804 if not isinstance(other, SeqRecord):
805
806
807 return SeqRecord(self.seq + other,
808 id = self.id, name = self.name,
809 description = self.description,
810 features = self.features[:],
811 annotations = self.annotations.copy(),
812 dbxrefs = self.dbxrefs[:])
813
814 answer = SeqRecord(self.seq + other.seq,
815 features = self.features[:],
816 dbxrefs = self.dbxrefs[:])
817
818 l = len(self)
819 for f in other.features:
820 answer.features.append(f._shift(l))
821 del l
822 for ref in other.dbxrefs:
823 if ref not in answer.dbxrefs:
824 answer.dbxrefs.append(ref)
825
826 if self.id == other.id:
827 answer.id = self.id
828 if self.name == other.name:
829 answer.name = self.name
830 if self.description == other.description:
831 answer.description = self.description
832 for k, v in self.annotations.iteritems():
833 if k in other.annotations and other.annotations[k] == v:
834 answer.annotations[k] = v
835
836 for k, v in self.letter_annotations.iteritems():
837 if k in other.letter_annotations:
838 answer.letter_annotations[k] = v + other.letter_annotations[k]
839 return answer
840
842 """Add another sequence or string to this sequence (from the left).
843
844 This method handles adding a Seq object (or similar, e.g. MutableSeq)
845 or a plain Python string (on the left) to a SeqRecord (on the right).
846 See the __add__ method for more details, but for example:
847
848 >>> from Bio import SeqIO
849 >>> handle = open("Quality/solexa_faked.fastq", "rU")
850 >>> record = SeqIO.read(handle, "fastq-solexa")
851 >>> handle.close()
852 >>> print record.id, record.seq
853 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
854 >>> print record.letter_annotations.keys()
855 ['solexa_quality']
856
857 >>> new = "ACT" + record
858 >>> print new.id, new.seq
859 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
860 >>> print new.letter_annotations.keys()
861 []
862 """
863 if isinstance(other, SeqRecord):
864 raise RuntimeError("This should have happened via the __add__ of "
865 "the other SeqRecord being added!")
866
867
868 offset = len(other)
869 return SeqRecord(other + self.seq,
870 id = self.id, name = self.name,
871 description = self.description,
872 features = [f._shift(offset) for f in self.features],
873 annotations = self.annotations.copy(),
874 dbxrefs = self.dbxrefs[:])
875
877 """Returns a copy of the record with an upper case sequence.
878
879 All the annotation is preserved unchanged. e.g.
880
881 >>> from Bio.Alphabet import generic_dna
882 >>> from Bio.Seq import Seq
883 >>> from Bio.SeqRecord import SeqRecord
884 >>> record = SeqRecord(Seq("acgtACGT", generic_dna), id="Test",
885 ... description = "Made up for this example")
886 >>> record.letter_annotations["phred_quality"] = [1,2,3,4,5,6,7,8]
887 >>> print record.upper().format("fastq")
888 @Test Made up for this example
889 ACGTACGT
890 +
891 "#$%&'()
892 <BLANKLINE>
893
894 Naturally, there is a matching lower method:
895
896 >>> print record.lower().format("fastq")
897 @Test Made up for this example
898 acgtacgt
899 +
900 "#$%&'()
901 <BLANKLINE>
902 """
903 return SeqRecord(self.seq.upper(),
904 id = self.id, name = self.name,
905 description = self.description,
906 dbxrefs = self.dbxrefs[:],
907 features = self.features[:],
908 annotations = self.annotations.copy(),
909 letter_annotations=self.letter_annotations.copy())
910
912 """Returns a copy of the record with a lower case sequence.
913
914 All the annotation is preserved unchanged. e.g.
915
916 >>> from Bio import SeqIO
917 >>> record = SeqIO.read("Fasta/aster.pro", "fasta")
918 >>> print record.format("fasta")
919 >gi|3298468|dbj|BAA31520.1| SAMIPF
920 GGHVNPAVTFGAFVGGNITLLRGIVYIIAQLLGSTVACLLLKFVTNDMAVGVFSLSAGVG
921 VTNALVFEIVMTFGLVYTVYATAIDPKKGSLGTIAPIAIGFIVGANI
922 <BLANKLINE>
923 >>> print record.lower().format("fasta")
924 >gi|3298468|dbj|BAA31520.1| SAMIPF
925 gghvnpavtfgafvggnitllrgivyiiaqllgstvaclllkfvtndmavgvfslsagvg
926 vtnalvfeivmtfglvytvyataidpkkgslgtiapiaigfivgani
927 <BLANKLINE>
928
929 To take a more annotation rich example,
930
931 >>> from Bio import SeqIO
932 >>> old = SeqIO.read("EMBL/TRBG361.embl", "embl")
933 >>> len(old.features)
934 3
935 >>> new = old.lower()
936 >>> len(old.features) == len(new.features)
937 True
938 >>> old.annotations["organism"] == new.annotations["organism"]
939 True
940 >>> old.dbxrefs == new.dbxrefs
941 True
942 """
943 return SeqRecord(self.seq.lower(),
944 id = self.id, name = self.name,
945 description = self.description,
946 dbxrefs = self.dbxrefs[:],
947 features = self.features[:],
948 annotations = self.annotations.copy(),
949 letter_annotations=self.letter_annotations.copy())
950
951 - def reverse_complement(self, id=False, name=False, description=False,
952 features=True, annotations=False,
953 letter_annotations=True, dbxrefs=False):
954 """Returns new SeqRecord with reverse complement sequence.
955
956 You can specify the returned record's id, name and description as
957 strings, or True to keep that of the parent, or False for a default.
958
959 You can specify the returned record's features with a list of
960 SeqFeature objects, or True to keep that of the parent, or False to
961 omit them. The default is to keep the original features (with the
962 strand and locations adjusted).
963
964 You can also specify both the returned record's annotations and
965 letter_annotations as dictionaries, True to keep that of the parent,
966 or False to omit them. The default is to keep the original
967 annotations (with the letter annotations reversed).
968
969 To show what happens to the pre-letter annotations, consider an
970 example Solexa variant FASTQ file with a single entry, which we'll
971 read in as a SeqRecord:
972
973 >>> from Bio import SeqIO
974 >>> handle = open("Quality/solexa_faked.fastq", "rU")
975 >>> record = SeqIO.read(handle, "fastq-solexa")
976 >>> handle.close()
977 >>> print record.id, record.seq
978 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN
979 >>> print record.letter_annotations.keys()
980 ['solexa_quality']
981 >>> print record.letter_annotations["solexa_quality"]
982 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5]
983
984 Now take the reverse complement,
985
986 >>> rc_record = record.reverse_complement(id=record.id+"_rc")
987 >>> print rc_record.id, rc_record.seq
988 slxa_0001_1_0001_01_rc NNNNNNACGTACGTACGTACGTACGTACGTACGTACGTACGTACGT
989
990 Notice that the per-letter-annotations have also been reversed,
991 although this may not be appropriate for all cases.
992
993 >>> print rc_record.letter_annotations["solexa_quality"]
994 [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40]
995
996 Now for the features, we need a different example. Parsing a GenBank
997 file is probably the easiest way to get an nice example with features
998 in it...
999
1000 >>> from Bio import SeqIO
1001 >>> handle = open("GenBank/pBAD30.gb")
1002 >>> plasmid = SeqIO.read(handle, "gb")
1003 >>> handle.close()
1004 >>> print plasmid.id, len(plasmid)
1005 pBAD30 4923
1006 >>> plasmid.seq
1007 Seq('GCTAGCGGAGTGTATACTGGCTTACTATGTTGGCACTGATGAGGGTGTCAGTGA...ATG', IUPACAmbiguousDNA())
1008 >>> len(plasmid.features)
1009 13
1010
1011 Now, let's take the reverse complement of this whole plasmid:
1012
1013 >>> rc_plasmid = plasmid.reverse_complement(id=plasmid.id+"_rc")
1014 >>> print rc_plasmid.id, len(rc_plasmid)
1015 pBAD30_rc 4923
1016 >>> rc_plasmid.seq
1017 Seq('CATGGGCAAATATTATACGCAAGGCGACAAGGTGCTGATGCCGCTGGCGATTCA...AGC', IUPACAmbiguousDNA())
1018 >>> len(rc_plasmid.features)
1019 13
1020
1021 Let's compare the first CDS feature - it has gone from being the
1022 second feature (index 1) to the second last feature (index -2), its
1023 strand has changed, and the location switched round.
1024
1025 >>> print plasmid.features[1]
1026 type: CDS
1027 location: [1081:1960](-)
1028 qualifiers:
1029 Key: label, Value: ['araC']
1030 Key: note, Value: ['araC regulator of the arabinose BAD promoter']
1031 Key: vntifkey, Value: ['4']
1032 <BLANKLINE>
1033 >>> print rc_plasmid.features[-2]
1034 type: CDS
1035 location: [2963:3842](+)
1036 qualifiers:
1037 Key: label, Value: ['araC']
1038 Key: note, Value: ['araC regulator of the arabinose BAD promoter']
1039 Key: vntifkey, Value: ['4']
1040 <BLANKLINE>
1041
1042 You can check this new location, based on the length of the plasmid:
1043
1044 >>> len(plasmid) - 1081
1045 3842
1046 >>> len(plasmid) - 1960
1047 2963
1048
1049 Note that if the SeqFeature annotation includes any strand specific
1050 information (e.g. base changes for a SNP), this information is not
1051 ammended, and would need correction after the reverse complement.
1052
1053 Note trying to reverse complement a protein SeqRecord raises an
1054 exception:
1055
1056 >>> from Bio.SeqRecord import SeqRecord
1057 >>> from Bio.Seq import Seq
1058 >>> from Bio.Alphabet import IUPAC
1059 >>> protein_rec = SeqRecord(Seq("MAIVMGR", IUPAC.protein), id="Test")
1060 >>> protein_rec.reverse_complement()
1061 Traceback (most recent call last):
1062 ...
1063 ValueError: Proteins do not have complements!
1064
1065 Also note you can reverse complement a SeqRecord using a MutableSeq:
1066
1067 >>> from Bio.SeqRecord import SeqRecord
1068 >>> from Bio.Seq import MutableSeq
1069 >>> from Bio.Alphabet import generic_dna
1070 >>> rec = SeqRecord(MutableSeq("ACGT", generic_dna), id="Test")
1071 >>> rec.seq[0] = "T"
1072 >>> print rec.id, rec.seq
1073 Test TCGT
1074 >>> rc = rec.reverse_complement(id=True)
1075 >>> print rc.id, rc.seq
1076 Test ACGA
1077 """
1078 from Bio.Seq import MutableSeq
1079 if isinstance(self.seq, MutableSeq):
1080
1081 answer = SeqRecord(self.seq.toseq().reverse_complement())
1082 else:
1083 answer = SeqRecord(self.seq.reverse_complement())
1084 if isinstance(id, basestring):
1085 answer.id = id
1086 elif id:
1087 answer.id = self.id
1088 if isinstance(name, basestring):
1089 answer.name = name
1090 elif name:
1091 answer.name = self.name
1092 if isinstance(description, basestring):
1093 answer.description = description
1094 elif description:
1095 answer.description = self.description
1096 if isinstance(dbxrefs, list):
1097 answer.dbxrefs = dbxrefs
1098 elif dbxrefs:
1099
1100 answer.dbxrefs = self.dbxrefs[:]
1101 if isinstance(features, list):
1102 answer.features = features
1103 elif features:
1104
1105 l = len(answer)
1106 answer.features = [f._flip(l) for f in self.features]
1107
1108
1109
1110
1111
1112 answer.features.sort(key=lambda x: x.location.start.position)
1113 if isinstance(annotations, dict):
1114 answer.annotations = annotations
1115 elif annotations:
1116
1117 answer.annotations = self.annotations.copy()
1118 if isinstance(letter_annotations, dict):
1119 answer.letter_annotations = letter_annotations
1120 elif letter_annotations:
1121
1122 for key, value in self.letter_annotations.iteritems():
1123 answer._per_letter_annotations[key] = value[::-1]
1124 return answer
1125
1126
1127 if __name__ == "__main__":
1128 from Bio._utils import run_doctest
1129 run_doctest()
1130