Package Bio :: Package Align
[hide private]
[frames] | no frames]

Source Code for Package Bio.Align

  1  # Copyright 2008-2011 by Peter Cock. 
  2  # All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Code for dealing with sequence alignments. 
  7   
  8  One of the most important things in this module is the MultipleSeqAlignment 
  9  class, used in the Bio.AlignIO module. 
 10   
 11  """ 
 12  from __future__ import print_function 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.SeqRecord import SeqRecord 
 16  from Bio import Alphabet 
 17   
 18   
19 -class MultipleSeqAlignment(object):
20 """Represents a classical multiple sequence alignment (MSA). 21 22 By this we mean a collection of sequences (usually shown as rows) which 23 are all the same length (usually with gap characters for insertions or 24 padding). The data can then be regarded as a matrix of letters, with well 25 defined columns. 26 27 You would typically create an MSA by loading an alignment file with the 28 AlignIO module: 29 30 >>> from Bio import AlignIO 31 >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal") 32 >>> print(align) 33 SingleLetterAlphabet() alignment with 7 rows and 156 columns 34 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 35 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 36 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 37 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 38 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 39 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 40 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 41 42 In some respects you can treat these objects as lists of SeqRecord objects, 43 each representing a row of the alignment. Iterating over an alignment gives 44 the SeqRecord object for each row: 45 46 >>> len(align) 47 7 48 >>> for record in align: 49 ... print("%s %i" % (record.id, len(record))) 50 gi|6273285|gb|AF191659.1|AF191 156 51 gi|6273284|gb|AF191658.1|AF191 156 52 gi|6273287|gb|AF191661.1|AF191 156 53 gi|6273286|gb|AF191660.1|AF191 156 54 gi|6273290|gb|AF191664.1|AF191 156 55 gi|6273289|gb|AF191663.1|AF191 156 56 gi|6273291|gb|AF191665.1|AF191 156 57 58 You can also access individual rows as SeqRecord objects via their index: 59 60 >>> print(align[0].id) 61 gi|6273285|gb|AF191659.1|AF191 62 >>> print(align[-1].id) 63 gi|6273291|gb|AF191665.1|AF191 64 65 And extract columns as strings: 66 67 >>> print(align[:, 1]) 68 AAAAAAA 69 70 Or, take just the first ten columns as a sub-alignment: 71 72 >>> print(align[:, :10]) 73 SingleLetterAlphabet() alignment with 7 rows and 10 columns 74 TATACATTAA gi|6273285|gb|AF191659.1|AF191 75 TATACATTAA gi|6273284|gb|AF191658.1|AF191 76 TATACATTAA gi|6273287|gb|AF191661.1|AF191 77 TATACATAAA gi|6273286|gb|AF191660.1|AF191 78 TATACATTAA gi|6273290|gb|AF191664.1|AF191 79 TATACATTAA gi|6273289|gb|AF191663.1|AF191 80 TATACATTAA gi|6273291|gb|AF191665.1|AF191 81 82 Combining this alignment slicing with alignment addition allows you to 83 remove a section of the alignment. For example, taking just the first 84 and last ten columns: 85 86 >>> print(align[:, :10] + align[:, -10:]) 87 SingleLetterAlphabet() alignment with 7 rows and 20 columns 88 TATACATTAAGTGTACCAGA gi|6273285|gb|AF191659.1|AF191 89 TATACATTAAGTGTACCAGA gi|6273284|gb|AF191658.1|AF191 90 TATACATTAAGTGTACCAGA gi|6273287|gb|AF191661.1|AF191 91 TATACATAAAGTGTACCAGA gi|6273286|gb|AF191660.1|AF191 92 TATACATTAAGTGTACCAGA gi|6273290|gb|AF191664.1|AF191 93 TATACATTAAGTATACCAGA gi|6273289|gb|AF191663.1|AF191 94 TATACATTAAGTGTACCAGA gi|6273291|gb|AF191665.1|AF191 95 96 Note - This object replaced the older Alignment object defined in module 97 Bio.Align.Generic but is not fully backwards compatible with it. 98 99 Note - This object does NOT attempt to model the kind of alignments used 100 in next generation sequencing with multiple sequencing reads which are 101 much shorter than the alignment, and where there is usually a consensus or 102 reference sequence with special status. 103 """ 104
105 - def __init__(self, records, alphabet=None, 106 annotations=None):
107 """Initialize a new MultipleSeqAlignment object. 108 109 Arguments: 110 - records - A list (or iterator) of SeqRecord objects, whose 111 sequences are all the same length. This may be an be an 112 empty list. 113 - alphabet - The alphabet for the whole alignment, typically a gapped 114 alphabet, which should be a super-set of the individual 115 record alphabets. If omitted, a consensus alphabet is 116 used. 117 - annotations - Information about the whole alignment (dictionary). 118 119 You would normally load a MSA from a file using Bio.AlignIO, but you 120 can do this from a list of SeqRecord objects too: 121 122 >>> from Bio.Alphabet import generic_dna 123 >>> from Bio.Seq import Seq 124 >>> from Bio.SeqRecord import SeqRecord 125 >>> from Bio.Align import MultipleSeqAlignment 126 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 127 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 128 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 129 >>> align = MultipleSeqAlignment([a, b, c], annotations={"tool": "demo"}) 130 >>> print(align) 131 DNAAlphabet() alignment with 3 rows and 7 columns 132 AAAACGT Alpha 133 AAA-CGT Beta 134 AAAAGGT Gamma 135 >>> align.annotations 136 {'tool': 'demo'} 137 138 NOTE - The older Bio.Align.Generic.Alignment class only accepted a 139 single argument, an alphabet. This is still supported via a backwards 140 compatible "hack" so as not to disrupt existing scripts and users, but 141 is deprecated and will be removed in a future release. 142 """ 143 if isinstance(records, (Alphabet.Alphabet, Alphabet.AlphabetEncoder)): 144 if alphabet is None: 145 # TODO - Remove this backwards compatible mode! 146 alphabet = records 147 records = [] 148 import warnings 149 from Bio import BiopythonDeprecationWarning 150 warnings.warn("Invalid records argument: While the old " 151 "Bio.Align.Generic.Alignment class only " 152 "accepted a single argument (the alphabet), the " 153 "newer Bio.Align.MultipleSeqAlignment class " 154 "expects a list/iterator of SeqRecord objects " 155 "(which can be an empty list) and an optional " 156 "alphabet argument", BiopythonDeprecationWarning) 157 else: 158 raise ValueError("Invalid records argument") 159 if alphabet is not None: 160 if not isinstance(alphabet, (Alphabet.Alphabet, Alphabet.AlphabetEncoder)): 161 raise ValueError("Invalid alphabet argument") 162 self._alphabet = alphabet 163 else: 164 # Default while we add sequences, will take a consensus later 165 self._alphabet = Alphabet.single_letter_alphabet 166 167 self._records = [] 168 if records: 169 self.extend(records) 170 if alphabet is None: 171 # No alphabet was given, take a consensus alphabet 172 self._alphabet = Alphabet._consensus_alphabet(rec.seq.alphabet for 173 rec in self._records 174 if rec.seq is not None) 175 176 # Annotations about the whole alignment 177 if annotations is None: 178 annotations = {} 179 elif not isinstance(annotations, dict): 180 raise TypeError("annotations argument should be a dict") 181 self.annotations = annotations
182
183 - def _str_line(self, record, length=50):
184 """Return a truncated string representation of a SeqRecord (PRIVATE). 185 186 This is a PRIVATE function used by the __str__ method. 187 """ 188 if record.seq.__class__.__name__ == "CodonSeq": 189 if len(record.seq) <= length: 190 return "%s %s" % (record.seq, record.id) 191 else: 192 return "%s...%s %s" \ 193 % (record.seq[:length - 3], record.seq[-3:], record.id) 194 else: 195 if len(record.seq) <= length: 196 return "%s %s" % (record.seq, record.id) 197 else: 198 return "%s...%s %s" \ 199 % (record.seq[:length - 6], record.seq[-3:], record.id)
200
201 - def __str__(self):
202 """Return a multi-line string summary of the alignment. 203 204 This output is intended to be readable, but large alignments are 205 shown truncated. A maximum of 20 rows (sequences) and 50 columns 206 are shown, with the record identifiers. This should fit nicely on a 207 single screen. e.g. 208 209 >>> from Bio.Alphabet import IUPAC, Gapped 210 >>> from Bio.Align import MultipleSeqAlignment 211 >>> align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-")) 212 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 213 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 214 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 215 >>> print(align) 216 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 217 ACTGCTAGCTAG Alpha 218 ACT-CTAGCTAG Beta 219 ACTGCTAGATAG Gamma 220 221 See also the alignment's format method. 222 """ 223 rows = len(self._records) 224 lines = ["%s alignment with %i rows and %i columns" 225 % (str(self._alphabet), rows, self.get_alignment_length())] 226 if rows <= 20: 227 lines.extend(self._str_line(rec) for rec in self._records) 228 else: 229 lines.extend(self._str_line(rec) for rec in self._records[:18]) 230 lines.append("...") 231 lines.append(self._str_line(self._records[-1])) 232 return "\n".join(lines)
233
234 - def __repr__(self):
235 """Return a representation of the object for debugging. 236 237 The representation cannot be used with eval() to recreate the object, 238 which is usually possible with simple python ojects. For example: 239 240 <Bio.Align.MultipleSeqAlignment instance (2 records of length 14, 241 SingleLetterAlphabet()) at a3c184c> 242 243 The hex string is the memory address of the object, see help(id). 244 This provides a simple way to visually distinguish alignments of 245 the same size. 246 """ 247 # A doctest for __repr__ would be nice, but __class__ comes out differently 248 # if run via the __main__ trick. 249 return "<%s instance (%i records of length %i, %s) at %x>" % \ 250 (self.__class__, len(self._records), 251 self.get_alignment_length(), repr(self._alphabet), id(self))
252 # This version is useful for doing eval(repr(alignment)), 253 # but it can be VERY long: 254 # return "%s(%s, %s)" \ 255 # % (self.__class__, repr(self._records), repr(self._alphabet)) 256
257 - def format(self, format):
258 """Return the alignment as a string in the specified file format. 259 260 The format should be a lower case string supported as an output 261 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 262 "stockholm", etc), which is used to turn the alignment into a 263 string. 264 265 e.g. 266 267 >>> from Bio.Alphabet import IUPAC, Gapped 268 >>> from Bio.Align import MultipleSeqAlignment 269 >>> align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-")) 270 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 271 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 272 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 273 >>> print(align.format("fasta")) 274 >Alpha 275 ACTGCTAGCTAG 276 >Beta 277 ACT-CTAGCTAG 278 >Gamma 279 ACTGCTAGATAG 280 <BLANKLINE> 281 >>> print(align.format("phylip")) 282 3 12 283 Alpha ACTGCTAGCT AG 284 Beta ACT-CTAGCT AG 285 Gamma ACTGCTAGAT AG 286 <BLANKLINE> 287 288 For Python 2.6, 3.0 or later see also the built in format() function. 289 """ 290 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 291 # See also the SeqRecord class and its format() method using Bio.SeqIO 292 return self.__format__(format)
293
294 - def __format__(self, format_spec):
295 """Return the alignment as a string in the specified file format. 296 297 This method supports the python format() function added in 298 Python 2.6/3.0. The format_spec should be a lower case 299 string supported by Bio.AlignIO as an output file format. 300 See also the alignment's format() method. 301 """ 302 if format_spec: 303 from Bio._py3k import StringIO 304 from Bio import AlignIO 305 handle = StringIO() 306 AlignIO.write([self], handle, format_spec) 307 return handle.getvalue() 308 else: 309 # Follow python convention and default to using __str__ 310 return str(self)
311
312 - def __iter__(self):
313 """Iterate over alignment rows as SeqRecord objects. 314 315 e.g. 316 317 >>> from Bio.Alphabet import IUPAC, Gapped 318 >>> from Bio.Align import MultipleSeqAlignment 319 >>> align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-")) 320 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 321 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 322 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 323 >>> for record in align: 324 ... print(record.id) 325 ... print(record.seq) 326 Alpha 327 ACTGCTAGCTAG 328 Beta 329 ACT-CTAGCTAG 330 Gamma 331 ACTGCTAGATAG 332 """ 333 return iter(self._records)
334
335 - def __len__(self):
336 """Return the number of sequences in the alignment. 337 338 Use len(alignment) to get the number of sequences (i.e. the number of 339 rows), and alignment.get_alignment_length() to get the length of the 340 longest sequence (i.e. the number of columns). 341 342 This is easy to remember if you think of the alignment as being like a 343 list of SeqRecord objects. 344 """ 345 return len(self._records)
346
347 - def get_alignment_length(self):
348 """Return the maximum length of the alignment. 349 350 All objects in the alignment should (hopefully) have the same 351 length. This function will go through and find this length 352 by finding the maximum length of sequences in the alignment. 353 354 >>> from Bio.Alphabet import IUPAC, Gapped 355 >>> from Bio.Align import MultipleSeqAlignment 356 >>> align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-")) 357 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 358 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 359 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 360 >>> align.get_alignment_length() 361 12 362 363 If you want to know the number of sequences in the alignment, 364 use len(align) instead: 365 366 >>> len(align) 367 3 368 369 """ 370 max_length = 0 371 372 for record in self._records: 373 if len(record.seq) > max_length: 374 max_length = len(record.seq) 375 376 return max_length
377
378 - def add_sequence(self, descriptor, sequence, start=None, end=None, 379 weight=1.0):
380 """Add a sequence to the alignment. 381 382 This doesn't do any kind of alignment, it just adds in the sequence 383 object, which is assumed to be prealigned with the existing 384 sequences. 385 386 Arguments: 387 - descriptor - The descriptive id of the sequence being added. 388 This will be used as the resulting SeqRecord's 389 .id property (and, for historical compatibility, 390 also the .description property) 391 - sequence - A string with sequence info. 392 - start - You can explicitly set the start point of the sequence. 393 This is useful (at least) for BLAST alignments, which can 394 just be partial alignments of sequences. 395 - end - Specify the end of the sequence, which is important 396 for the same reason as the start. 397 - weight - The weight to place on the sequence in the alignment. 398 By default, all sequences have the same weight. (0.0 => 399 no weight, 1.0 => highest weight) 400 401 In general providing a SeqRecord and calling .append is prefered. 402 """ 403 new_seq = Seq(sequence, self._alphabet) 404 405 # We are now effectively using the SeqRecord's .id as 406 # the primary identifier (e.g. in Bio.SeqIO) so we should 407 # populate it with the descriptor. 408 # For backwards compatibility, also store this in the 409 # SeqRecord's description property. 410 new_record = SeqRecord(new_seq, 411 id=descriptor, 412 description=descriptor) 413 414 # hack! We really need to work out how to deal with annotations 415 # and features in biopython. Right now, I'll just use the 416 # generic annotations dictionary we've got to store the start 417 # and end, but we should think up something better. I don't know 418 # if I'm really a big fan of the LocatableSeq thing they've got 419 # in BioPerl, but I'm not positive what the best thing to do on 420 # this is... 421 if start: 422 new_record.annotations['start'] = start 423 if end: 424 new_record.annotations['end'] = end 425 426 # another hack to add weight information to the sequence 427 new_record.annotations['weight'] = weight 428 429 self._records.append(new_record)
430
431 - def extend(self, records):
432 """Add more SeqRecord objects to the alignment as rows. 433 434 They must all have the same length as the original alignment, and have 435 alphabets compatible with the alignment's alphabet. For example, 436 437 >>> from Bio.Alphabet import generic_dna 438 >>> from Bio.Seq import Seq 439 >>> from Bio.SeqRecord import SeqRecord 440 >>> from Bio.Align import MultipleSeqAlignment 441 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 442 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 443 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 444 >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta") 445 >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon") 446 447 First we create a small alignment (three rows): 448 449 >>> align = MultipleSeqAlignment([a, b, c]) 450 >>> print(align) 451 DNAAlphabet() alignment with 3 rows and 7 columns 452 AAAACGT Alpha 453 AAA-CGT Beta 454 AAAAGGT Gamma 455 456 Now we can extend this alignment with another two rows: 457 458 >>> align.extend([d, e]) 459 >>> print(align) 460 DNAAlphabet() alignment with 5 rows and 7 columns 461 AAAACGT Alpha 462 AAA-CGT Beta 463 AAAAGGT Gamma 464 AAAACGT Delta 465 AAA-GGT Epsilon 466 467 Because the alignment object allows iteration over the rows as 468 SeqRecords, you can use the extend method with a second alignment 469 (provided its sequences have the same length as the original alignment). 470 """ 471 if len(self): 472 # Use the standard method to get the length 473 expected_length = self.get_alignment_length() 474 else: 475 # Take the first record's length 476 records = iter(records) # records arg could be list or iterator 477 try: 478 rec = next(records) 479 except StopIteration: 480 # Special case, no records 481 return 482 expected_length = len(rec) 483 self._append(rec, expected_length) 484 # Now continue to the rest of the records as usual 485 486 for rec in records: 487 self._append(rec, expected_length)
488
489 - def append(self, record):
490 """Add one more SeqRecord object to the alignment as a new row. 491 492 This must have the same length as the original alignment (unless this is 493 the first record), and have an alphabet compatible with the alignment's 494 alphabet. 495 496 >>> from Bio import AlignIO 497 >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal") 498 >>> print(align) 499 SingleLetterAlphabet() alignment with 7 rows and 156 columns 500 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 501 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 502 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 503 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 504 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 505 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 506 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 507 >>> len(align) 508 7 509 510 We'll now construct a dummy record to append as an example: 511 512 >>> from Bio.Seq import Seq 513 >>> from Bio.SeqRecord import SeqRecord 514 >>> dummy = SeqRecord(Seq("N"*156), id="dummy") 515 516 Now append this to the alignment, 517 518 >>> align.append(dummy) 519 >>> print(align) 520 SingleLetterAlphabet() alignment with 8 rows and 156 columns 521 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 522 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 523 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 524 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 525 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 526 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 527 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 528 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN dummy 529 >>> len(align) 530 8 531 532 """ 533 if self._records: 534 self._append(record, self.get_alignment_length()) 535 else: 536 self._append(record)
537
538 - def _append(self, record, expected_length=None):
539 """Validate and append a record (PRIVATE).""" 540 if not isinstance(record, SeqRecord): 541 raise TypeError("New sequence is not a SeqRecord object") 542 543 # Currently the get_alignment_length() call is expensive, so we need 544 # to avoid calling it repeatedly for __init__ and extend, hence this 545 # private _append method 546 if expected_length is not None and len(record) != expected_length: 547 # TODO - Use the following more helpful error, but update unit tests 548 # raise ValueError("New sequence is not of length %i" \ 549 # % self.get_alignment_length()) 550 raise ValueError("Sequences must all be the same length") 551 552 # Using not self.alphabet.contains(record.seq.alphabet) needs fixing 553 # for AlphabetEncoders (e.g. gapped versus ungapped). 554 if not Alphabet._check_type_compatible([self._alphabet, record.seq.alphabet]): 555 raise ValueError("New sequence's alphabet is incompatible") 556 self._records.append(record)
557
558 - def __add__(self, other):
559 """Combine two alignments with the same number of rows by adding them. 560 561 If you have two multiple sequence alignments (MSAs), there are two ways to think 562 about adding them - by row or by column. Using the extend method adds by row. 563 Using the addition operator adds by column. For example, 564 565 >>> from Bio.Alphabet import generic_dna 566 >>> from Bio.Seq import Seq 567 >>> from Bio.SeqRecord import SeqRecord 568 >>> from Bio.Align import MultipleSeqAlignment 569 >>> a1 = SeqRecord(Seq("AAAAC", generic_dna), id="Alpha") 570 >>> b1 = SeqRecord(Seq("AAA-C", generic_dna), id="Beta") 571 >>> c1 = SeqRecord(Seq("AAAAG", generic_dna), id="Gamma") 572 >>> a2 = SeqRecord(Seq("GT", generic_dna), id="Alpha") 573 >>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta") 574 >>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma") 575 >>> left = MultipleSeqAlignment([a1, b1, c1], 576 ... annotations={"tool": "demo", "name": "start"}) 577 >>> right = MultipleSeqAlignment([a2, b2, c2], 578 ... annotations={"tool": "demo", "name": "end"}) 579 580 Now, let's look at these two alignments: 581 582 >>> print(left) 583 DNAAlphabet() alignment with 3 rows and 5 columns 584 AAAAC Alpha 585 AAA-C Beta 586 AAAAG Gamma 587 >>> print(right) 588 DNAAlphabet() alignment with 3 rows and 2 columns 589 GT Alpha 590 GT Beta 591 GT Gamma 592 593 And add them: 594 595 >>> combined = left + right 596 >>> print(combined) 597 DNAAlphabet() alignment with 3 rows and 7 columns 598 AAAACGT Alpha 599 AAA-CGT Beta 600 AAAAGGT Gamma 601 602 For this to work, both alignments must have the same number of records (here 603 they both have 3 rows): 604 605 >>> len(left) 606 3 607 >>> len(right) 608 3 609 >>> len(combined) 610 3 611 612 The individual rows are SeqRecord objects, and these can be added together. Refer 613 to the SeqRecord documentation for details of how the annotation is handled. This 614 example is a special case in that both original alignments shared the same names, 615 meaning when the rows are added they also get the same name. 616 617 Any common annotations are preserved, but differing annotation is lost. This is 618 the same behaviour used in the SeqRecord annotations and is designed to prevent 619 accidental propagation of inappropriate values: 620 621 >>> combined.annotations 622 {'tool': 'demo'} 623 624 """ 625 if not isinstance(other, MultipleSeqAlignment): 626 raise NotImplementedError 627 if len(self) != len(other): 628 raise ValueError("When adding two alignments they must have the same length" 629 " (i.e. same number or rows)") 630 alpha = Alphabet._consensus_alphabet([self._alphabet, other._alphabet]) 631 merged = (left + right for left, right in zip(self, other)) 632 # Take any common annotation: 633 annotations = dict() 634 for k, v in self.annotations.items(): 635 if k in other.annotations and other.annotations[k] == v: 636 annotations[k] = v 637 return MultipleSeqAlignment(merged, alpha, annotations)
638
639 - def __getitem__(self, index):
640 """Access part of the alignment. 641 642 Depending on the indices, you can get a SeqRecord object 643 (representing a single row), a Seq object (for a single columns), 644 a string (for a single characters) or another alignment 645 (representing some part or all of the alignment). 646 647 align[r,c] gives a single character as a string 648 align[r] gives a row as a SeqRecord 649 align[r,:] gives a row as a SeqRecord 650 align[:,c] gives a column as a Seq (using the alignment's alphabet) 651 652 align[:] and align[:,:] give a copy of the alignment 653 654 Anything else gives a sub alignment, e.g. 655 align[0:2] or align[0:2,:] uses only row 0 and 1 656 align[:,1:3] uses only columns 1 and 2 657 align[0:2,1:3] uses only rows 0 & 1 and only cols 1 & 2 658 659 We'll use the following example alignment here for illustration: 660 661 >>> from Bio.Alphabet import generic_dna 662 >>> from Bio.Seq import Seq 663 >>> from Bio.SeqRecord import SeqRecord 664 >>> from Bio.Align import MultipleSeqAlignment 665 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 666 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 667 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 668 >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta") 669 >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon") 670 >>> align = MultipleSeqAlignment([a, b, c, d, e], generic_dna) 671 672 You can access a row of the alignment as a SeqRecord using an integer 673 index (think of the alignment as a list of SeqRecord objects here): 674 675 >>> first_record = align[0] 676 >>> print("%s %s" % (first_record.id, first_record.seq)) 677 Alpha AAAACGT 678 >>> last_record = align[-1] 679 >>> print("%s %s" % (last_record.id, last_record.seq)) 680 Epsilon AAA-GGT 681 682 You can also access use python's slice notation to create a sub-alignment 683 containing only some of the SeqRecord objects: 684 685 >>> sub_alignment = align[2:5] 686 >>> print(sub_alignment) 687 DNAAlphabet() alignment with 3 rows and 7 columns 688 AAAAGGT Gamma 689 AAAACGT Delta 690 AAA-GGT Epsilon 691 692 This includes support for a step, i.e. align[start:end:step], which 693 can be used to select every second sequence: 694 695 >>> sub_alignment = align[::2] 696 >>> print(sub_alignment) 697 DNAAlphabet() alignment with 3 rows and 7 columns 698 AAAACGT Alpha 699 AAAAGGT Gamma 700 AAA-GGT Epsilon 701 702 Or to get a copy of the alignment with the rows in reverse order: 703 704 >>> rev_alignment = align[::-1] 705 >>> print(rev_alignment) 706 DNAAlphabet() alignment with 5 rows and 7 columns 707 AAA-GGT Epsilon 708 AAAACGT Delta 709 AAAAGGT Gamma 710 AAA-CGT Beta 711 AAAACGT Alpha 712 713 You can also use two indices to specify both rows and columns. Using simple 714 integers gives you the entry as a single character string. e.g. 715 716 >>> align[3, 4] 717 'C' 718 719 This is equivalent to: 720 721 >>> align[3][4] 722 'C' 723 724 or: 725 726 >>> align[3].seq[4] 727 'C' 728 729 To get a single column (as a string) use this syntax: 730 731 >>> align[:, 4] 732 'CCGCG' 733 734 Or, to get part of a column, 735 736 >>> align[1:3, 4] 737 'CG' 738 739 However, in general you get a sub-alignment, 740 741 >>> print(align[1:5, 3:6]) 742 DNAAlphabet() alignment with 4 rows and 3 columns 743 -CG Beta 744 AGG Gamma 745 ACG Delta 746 -GG Epsilon 747 748 This should all seem familiar to anyone who has used the NumPy 749 array or matrix objects. 750 """ 751 if isinstance(index, int): 752 # e.g. result = align[x] 753 # Return a SeqRecord 754 return self._records[index] 755 elif isinstance(index, slice): 756 # e.g. sub_align = align[i:j:k] 757 return MultipleSeqAlignment(self._records[index], self._alphabet) 758 elif len(index) != 2: 759 raise TypeError("Invalid index type.") 760 761 # Handle double indexing 762 row_index, col_index = index 763 if isinstance(row_index, int): 764 # e.g. row_or_part_row = align[6, 1:4], gives a SeqRecord 765 return self._records[row_index][col_index] 766 elif isinstance(col_index, int): 767 # e.g. col_or_part_col = align[1:5, 6], gives a string 768 return "".join(rec[col_index] for rec in self._records[row_index]) 769 else: 770 # e.g. sub_align = align[1:4, 5:7], gives another alignment 771 return MultipleSeqAlignment((rec[col_index] for rec in self._records[row_index]), 772 self._alphabet)
773
774 - def sort(self, key=None, reverse=False):
775 """Sort the rows (SeqRecord objects) of the alignment in place. 776 777 This sorts the rows alphabetically using the SeqRecord object id by 778 default. The sorting can be controlled by supplying a key function 779 which must map each SeqRecord to a sort value. 780 781 This is useful if you want to add two alignments which use the same 782 record identifiers, but in a different order. For example, 783 784 >>> from Bio.Alphabet import generic_dna 785 >>> from Bio.Seq import Seq 786 >>> from Bio.SeqRecord import SeqRecord 787 >>> from Bio.Align import MultipleSeqAlignment 788 >>> align1 = MultipleSeqAlignment([ 789 ... SeqRecord(Seq("ACGT", generic_dna), id="Human"), 790 ... SeqRecord(Seq("ACGG", generic_dna), id="Mouse"), 791 ... SeqRecord(Seq("ACGC", generic_dna), id="Chicken"), 792 ... ]) 793 >>> align2 = MultipleSeqAlignment([ 794 ... SeqRecord(Seq("CGGT", generic_dna), id="Mouse"), 795 ... SeqRecord(Seq("CGTT", generic_dna), id="Human"), 796 ... SeqRecord(Seq("CGCT", generic_dna), id="Chicken"), 797 ... ]) 798 799 If you simple try and add these without sorting, you get this: 800 801 >>> print(align1 + align2) 802 DNAAlphabet() alignment with 3 rows and 8 columns 803 ACGTCGGT <unknown id> 804 ACGGCGTT <unknown id> 805 ACGCCGCT Chicken 806 807 Consult the SeqRecord documentation which explains why you get a 808 default value when annotation like the identifier doesn't match up. 809 However, if we sort the alignments first, then add them we get the 810 desired result: 811 812 >>> align1.sort() 813 >>> align2.sort() 814 >>> print(align1 + align2) 815 DNAAlphabet() alignment with 3 rows and 8 columns 816 ACGCCGCT Chicken 817 ACGTCGTT Human 818 ACGGCGGT Mouse 819 820 As an example using a different sort order, you could sort on the 821 GC content of each sequence. 822 823 >>> from Bio.SeqUtils import GC 824 >>> print(align1) 825 DNAAlphabet() alignment with 3 rows and 4 columns 826 ACGC Chicken 827 ACGT Human 828 ACGG Mouse 829 >>> align1.sort(key = lambda record: GC(record.seq)) 830 >>> print(align1) 831 DNAAlphabet() alignment with 3 rows and 4 columns 832 ACGT Human 833 ACGC Chicken 834 ACGG Mouse 835 836 There is also a reverse argument, so if you wanted to sort by ID 837 but backwards: 838 839 >>> align1.sort(reverse=True) 840 >>> print(align1) 841 DNAAlphabet() alignment with 3 rows and 4 columns 842 ACGG Mouse 843 ACGT Human 844 ACGC Chicken 845 846 """ 847 if key is None: 848 self._records.sort(key=lambda r: r.id, reverse=reverse) 849 else: 850 self._records.sort(key=key, reverse=reverse)
851 852 853 if __name__ == "__main__": 854 from Bio._utils import run_doctest 855 run_doctest() 856