Package Bio :: Package Align
[hide private]
[frames] | no frames]

Source Code for Package Bio.Align

  1  # Copyright 2008-2011 by Peter Cock. 
  2  # All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """Code for dealing with sequence alignments. 
  7   
  8  One of the most important things in this module is the MultipleSeqAlignment 
  9  class, used in the Bio.AlignIO module. 
 10   
 11  """ 
 12  from __future__ import print_function 
 13   
 14  from Bio.Seq import Seq 
 15  from Bio.SeqRecord import SeqRecord 
 16  from Bio import Alphabet 
 17   
 18   
19 -class MultipleSeqAlignment(object):
20 """Represents a classical multiple sequence alignment (MSA). 21 22 By this we mean a collection of sequences (usually shown as rows) which 23 are all the same length (usually with gap characters for insertions or 24 padding). The data can then be regarded as a matrix of letters, with well 25 defined columns. 26 27 You would typically create an MSA by loading an alignment file with the 28 AlignIO module: 29 30 >>> from Bio import AlignIO 31 >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal") 32 >>> print(align) 33 SingleLetterAlphabet() alignment with 7 rows and 156 columns 34 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 35 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 36 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 37 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 38 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 39 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 40 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 41 42 In some respects you can treat these objects as lists of SeqRecord objects, 43 each representing a row of the alignment. Iterating over an alignment gives 44 the SeqRecord object for each row: 45 46 >>> len(align) 47 7 48 >>> for record in align: 49 ... print("%s %i" % (record.id, len(record))) 50 gi|6273285|gb|AF191659.1|AF191 156 51 gi|6273284|gb|AF191658.1|AF191 156 52 gi|6273287|gb|AF191661.1|AF191 156 53 gi|6273286|gb|AF191660.1|AF191 156 54 gi|6273290|gb|AF191664.1|AF191 156 55 gi|6273289|gb|AF191663.1|AF191 156 56 gi|6273291|gb|AF191665.1|AF191 156 57 58 You can also access individual rows as SeqRecord objects via their index: 59 60 >>> print(align[0].id) 61 gi|6273285|gb|AF191659.1|AF191 62 >>> print(align[-1].id) 63 gi|6273291|gb|AF191665.1|AF191 64 65 And extract columns as strings: 66 67 >>> print(align[:, 1]) 68 AAAAAAA 69 70 Or, take just the first ten columns as a sub-alignment: 71 72 >>> print(align[:, :10]) 73 SingleLetterAlphabet() alignment with 7 rows and 10 columns 74 TATACATTAA gi|6273285|gb|AF191659.1|AF191 75 TATACATTAA gi|6273284|gb|AF191658.1|AF191 76 TATACATTAA gi|6273287|gb|AF191661.1|AF191 77 TATACATAAA gi|6273286|gb|AF191660.1|AF191 78 TATACATTAA gi|6273290|gb|AF191664.1|AF191 79 TATACATTAA gi|6273289|gb|AF191663.1|AF191 80 TATACATTAA gi|6273291|gb|AF191665.1|AF191 81 82 Combining this alignment slicing with alignment addition allows you to 83 remove a section of the alignment. For example, taking just the first 84 and last ten columns: 85 86 >>> print(align[:, :10] + align[:, -10:]) 87 SingleLetterAlphabet() alignment with 7 rows and 20 columns 88 TATACATTAAGTGTACCAGA gi|6273285|gb|AF191659.1|AF191 89 TATACATTAAGTGTACCAGA gi|6273284|gb|AF191658.1|AF191 90 TATACATTAAGTGTACCAGA gi|6273287|gb|AF191661.1|AF191 91 TATACATAAAGTGTACCAGA gi|6273286|gb|AF191660.1|AF191 92 TATACATTAAGTGTACCAGA gi|6273290|gb|AF191664.1|AF191 93 TATACATTAAGTATACCAGA gi|6273289|gb|AF191663.1|AF191 94 TATACATTAAGTGTACCAGA gi|6273291|gb|AF191665.1|AF191 95 96 Note - This object replaced the older Alignment object defined in module 97 Bio.Align.Generic but is not fully backwards compatible with it. 98 99 Note - This object does NOT attempt to model the kind of alignments used 100 in next generation sequencing with multiple sequencing reads which are 101 much shorter than the alignment, and where there is usually a consensus or 102 reference sequence with special status. 103 """ 104
105 - def __init__(self, records, alphabet=None, 106 annotations=None):
107 """Initialize a new MultipleSeqAlignment object. 108 109 Arguments: 110 - records - A list (or iterator) of SeqRecord objects, whose 111 sequences are all the same length. This may be an be an 112 empty list. 113 - alphabet - The alphabet for the whole alignment, typically a gapped 114 alphabet, which should be a super-set of the individual 115 record alphabets. If omitted, a consensus alphabet is 116 used. 117 - annotations - Information about the whole alignment (dictionary). 118 119 You would normally load a MSA from a file using Bio.AlignIO, but you 120 can do this from a list of SeqRecord objects too: 121 122 >>> from Bio.Alphabet import generic_dna 123 >>> from Bio.Seq import Seq 124 >>> from Bio.SeqRecord import SeqRecord 125 >>> from Bio.Align import MultipleSeqAlignment 126 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 127 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 128 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 129 >>> align = MultipleSeqAlignment([a, b, c], annotations={"tool": "demo"}) 130 >>> print(align) 131 DNAAlphabet() alignment with 3 rows and 7 columns 132 AAAACGT Alpha 133 AAA-CGT Beta 134 AAAAGGT Gamma 135 >>> align.annotations 136 {'tool': 'demo'} 137 138 NOTE - The older Bio.Align.Generic.Alignment class only accepted a 139 single argument, an alphabet. This is still supported via a backwards 140 compatible "hack" so as not to disrupt existing scripts and users, but 141 is deprecated and will be removed in a future release. 142 """ 143 if isinstance(records, (Alphabet.Alphabet, Alphabet.AlphabetEncoder)): 144 if alphabet is None: 145 # TODO - Remove this backwards compatible mode! 146 alphabet = records 147 records = [] 148 import warnings 149 from Bio import BiopythonDeprecationWarning 150 warnings.warn("Invalid records argument: While the old " 151 "Bio.Align.Generic.Alignment class only " 152 "accepted a single argument (the alphabet), the " 153 "newer Bio.Align.MultipleSeqAlignment class " 154 "expects a list/iterator of SeqRecord objects " 155 "(which can be an empty list) and an optional " 156 "alphabet argument", BiopythonDeprecationWarning) 157 else: 158 raise ValueError("Invalid records argument") 159 if alphabet is not None: 160 if not isinstance(alphabet, (Alphabet.Alphabet, Alphabet.AlphabetEncoder)): 161 raise ValueError("Invalid alphabet argument") 162 self._alphabet = alphabet 163 else: 164 # Default while we add sequences, will take a consensus later 165 self._alphabet = Alphabet.single_letter_alphabet 166 167 self._records = [] 168 if records: 169 self.extend(records) 170 if alphabet is None: 171 # No alphabet was given, take a consensus alphabet 172 self._alphabet = Alphabet._consensus_alphabet(rec.seq.alphabet for 173 rec in self._records 174 if rec.seq is not None) 175 176 # Annotations about the whole alignment 177 if annotations is None: 178 annotations = {} 179 elif not isinstance(annotations, dict): 180 raise TypeError("annotations argument should be a dict") 181 self.annotations = annotations
182
183 - def _str_line(self, record, length=50):
184 """Returns a truncated string representation of a SeqRecord (PRIVATE). 185 186 This is a PRIVATE function used by the __str__ method. 187 """ 188 if record.seq.__class__.__name__ == "CodonSeq": 189 if len(record.seq) <= length: 190 return "%s %s" % (record.seq, record.id) 191 else: 192 return "%s...%s %s" \ 193 % (record.seq[:length - 3], record.seq[-3:], record.id) 194 else: 195 if len(record.seq) <= length: 196 return "%s %s" % (record.seq, record.id) 197 else: 198 return "%s...%s %s" \ 199 % (record.seq[:length - 6], record.seq[-3:], record.id)
200
201 - def __str__(self):
202 """Returns a multi-line string summary of the alignment. 203 204 This output is intended to be readable, but large alignments are 205 shown truncated. A maximum of 20 rows (sequences) and 50 columns 206 are shown, with the record identifiers. This should fit nicely on a 207 single screen. e.g. 208 209 >>> from Bio.Alphabet import IUPAC, Gapped 210 >>> from Bio.Align import MultipleSeqAlignment 211 >>> align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-")) 212 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 213 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 214 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 215 >>> print(align) 216 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 217 ACTGCTAGCTAG Alpha 218 ACT-CTAGCTAG Beta 219 ACTGCTAGATAG Gamma 220 221 See also the alignment's format method. 222 """ 223 rows = len(self._records) 224 lines = ["%s alignment with %i rows and %i columns" 225 % (str(self._alphabet), rows, self.get_alignment_length())] 226 if rows <= 20: 227 lines.extend(self._str_line(rec) for rec in self._records) 228 else: 229 lines.extend(self._str_line(rec) for rec in self._records[:18]) 230 lines.append("...") 231 lines.append(self._str_line(self._records[-1])) 232 return "\n".join(lines)
233
234 - def __repr__(self):
235 """Returns a representation of the object for debugging. 236 237 The representation cannot be used with eval() to recreate the object, 238 which is usually possible with simple python ojects. For example: 239 240 <Bio.Align.MultipleSeqAlignment instance (2 records of length 14, 241 SingleLetterAlphabet()) at a3c184c> 242 243 The hex string is the memory address of the object, see help(id). 244 This provides a simple way to visually distinguish alignments of 245 the same size. 246 """ 247 # A doctest for __repr__ would be nice, but __class__ comes out differently 248 # if run via the __main__ trick. 249 return "<%s instance (%i records of length %i, %s) at %x>" % \ 250 (self.__class__, len(self._records), 251 self.get_alignment_length(), repr(self._alphabet), id(self))
252 # This version is useful for doing eval(repr(alignment)), 253 # but it can be VERY long: 254 # return "%s(%s, %s)" \ 255 # % (self.__class__, repr(self._records), repr(self._alphabet)) 256
257 - def format(self, format):
258 """Returns the alignment as a string in the specified file format. 259 260 The format should be a lower case string supported as an output 261 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 262 "stockholm", etc), which is used to turn the alignment into a 263 string. 264 265 e.g. 266 267 >>> from Bio.Alphabet import IUPAC, Gapped 268 >>> from Bio.Align import MultipleSeqAlignment 269 >>> align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-")) 270 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 271 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 272 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 273 >>> print(align.format("fasta")) 274 >Alpha 275 ACTGCTAGCTAG 276 >Beta 277 ACT-CTAGCTAG 278 >Gamma 279 ACTGCTAGATAG 280 <BLANKLINE> 281 >>> print(align.format("phylip")) 282 3 12 283 Alpha ACTGCTAGCT AG 284 Beta ACT-CTAGCT AG 285 Gamma ACTGCTAGAT AG 286 <BLANKLINE> 287 288 For Python 2.6, 3.0 or later see also the built in format() function. 289 """ 290 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 291 # See also the SeqRecord class and its format() method using Bio.SeqIO 292 return self.__format__(format)
293
294 - def __format__(self, format_spec):
295 """Returns the alignment as a string in the specified file format. 296 297 This method supports the python format() function added in 298 Python 2.6/3.0. The format_spec should be a lower case 299 string supported by Bio.AlignIO as an output file format. 300 See also the alignment's format() method.""" 301 if format_spec: 302 from Bio._py3k import StringIO 303 from Bio import AlignIO 304 handle = StringIO() 305 AlignIO.write([self], handle, format_spec) 306 return handle.getvalue() 307 else: 308 # Follow python convention and default to using __str__ 309 return str(self)
310
311 - def __iter__(self):
312 """Iterate over alignment rows as SeqRecord objects. 313 314 e.g. 315 316 >>> from Bio.Alphabet import IUPAC, Gapped 317 >>> from Bio.Align import MultipleSeqAlignment 318 >>> align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-")) 319 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 320 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 321 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 322 >>> for record in align: 323 ... print(record.id) 324 ... print(record.seq) 325 Alpha 326 ACTGCTAGCTAG 327 Beta 328 ACT-CTAGCTAG 329 Gamma 330 ACTGCTAGATAG 331 """ 332 return iter(self._records)
333
334 - def __len__(self):
335 """Returns the number of sequences in the alignment. 336 337 Use len(alignment) to get the number of sequences (i.e. the number of 338 rows), and alignment.get_alignment_length() to get the length of the 339 longest sequence (i.e. the number of columns). 340 341 This is easy to remember if you think of the alignment as being like a 342 list of SeqRecord objects. 343 """ 344 return len(self._records)
345
346 - def get_alignment_length(self):
347 """Return the maximum length of the alignment. 348 349 All objects in the alignment should (hopefully) have the same 350 length. This function will go through and find this length 351 by finding the maximum length of sequences in the alignment. 352 353 >>> from Bio.Alphabet import IUPAC, Gapped 354 >>> from Bio.Align import MultipleSeqAlignment 355 >>> align = MultipleSeqAlignment([], Gapped(IUPAC.unambiguous_dna, "-")) 356 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 357 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 358 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 359 >>> align.get_alignment_length() 360 12 361 362 If you want to know the number of sequences in the alignment, 363 use len(align) instead: 364 365 >>> len(align) 366 3 367 368 """ 369 max_length = 0 370 371 for record in self._records: 372 if len(record.seq) > max_length: 373 max_length = len(record.seq) 374 375 return max_length
376
377 - def add_sequence(self, descriptor, sequence, start=None, end=None, 378 weight=1.0):
379 """Add a sequence to the alignment. 380 381 This doesn't do any kind of alignment, it just adds in the sequence 382 object, which is assumed to be prealigned with the existing 383 sequences. 384 385 Arguments: 386 - descriptor - The descriptive id of the sequence being added. 387 This will be used as the resulting SeqRecord's 388 .id property (and, for historical compatibility, 389 also the .description property) 390 - sequence - A string with sequence info. 391 - start - You can explicitly set the start point of the sequence. 392 This is useful (at least) for BLAST alignments, which can 393 just be partial alignments of sequences. 394 - end - Specify the end of the sequence, which is important 395 for the same reason as the start. 396 - weight - The weight to place on the sequence in the alignment. 397 By default, all sequences have the same weight. (0.0 => 398 no weight, 1.0 => highest weight) 399 400 In general providing a SeqRecord and calling .append is prefered. 401 """ 402 new_seq = Seq(sequence, self._alphabet) 403 404 # We are now effectively using the SeqRecord's .id as 405 # the primary identifier (e.g. in Bio.SeqIO) so we should 406 # populate it with the descriptor. 407 # For backwards compatibility, also store this in the 408 # SeqRecord's description property. 409 new_record = SeqRecord(new_seq, 410 id=descriptor, 411 description=descriptor) 412 413 # hack! We really need to work out how to deal with annotations 414 # and features in biopython. Right now, I'll just use the 415 # generic annotations dictionary we've got to store the start 416 # and end, but we should think up something better. I don't know 417 # if I'm really a big fan of the LocatableSeq thing they've got 418 # in BioPerl, but I'm not positive what the best thing to do on 419 # this is... 420 if start: 421 new_record.annotations['start'] = start 422 if end: 423 new_record.annotations['end'] = end 424 425 # another hack to add weight information to the sequence 426 new_record.annotations['weight'] = weight 427 428 self._records.append(new_record)
429
430 - def extend(self, records):
431 """Add more SeqRecord objects to the alignment as rows. 432 433 They must all have the same length as the original alignment, and have 434 alphabets compatible with the alignment's alphabet. For example, 435 436 >>> from Bio.Alphabet import generic_dna 437 >>> from Bio.Seq import Seq 438 >>> from Bio.SeqRecord import SeqRecord 439 >>> from Bio.Align import MultipleSeqAlignment 440 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 441 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 442 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 443 >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta") 444 >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon") 445 446 First we create a small alignment (three rows): 447 448 >>> align = MultipleSeqAlignment([a, b, c]) 449 >>> print(align) 450 DNAAlphabet() alignment with 3 rows and 7 columns 451 AAAACGT Alpha 452 AAA-CGT Beta 453 AAAAGGT Gamma 454 455 Now we can extend this alignment with another two rows: 456 457 >>> align.extend([d, e]) 458 >>> print(align) 459 DNAAlphabet() alignment with 5 rows and 7 columns 460 AAAACGT Alpha 461 AAA-CGT Beta 462 AAAAGGT Gamma 463 AAAACGT Delta 464 AAA-GGT Epsilon 465 466 Because the alignment object allows iteration over the rows as 467 SeqRecords, you can use the extend method with a second alignment 468 (provided its sequences have the same length as the original alignment). 469 """ 470 if len(self): 471 # Use the standard method to get the length 472 expected_length = self.get_alignment_length() 473 else: 474 # Take the first record's length 475 records = iter(records) # records arg could be list or iterator 476 try: 477 rec = next(records) 478 except StopIteration: 479 # Special case, no records 480 return 481 expected_length = len(rec) 482 self._append(rec, expected_length) 483 # Now continue to the rest of the records as usual 484 485 for rec in records: 486 self._append(rec, expected_length)
487
488 - def append(self, record):
489 """Add one more SeqRecord object to the alignment as a new row. 490 491 This must have the same length as the original alignment (unless this is 492 the first record), and have an alphabet compatible with the alignment's 493 alphabet. 494 495 >>> from Bio import AlignIO 496 >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal") 497 >>> print(align) 498 SingleLetterAlphabet() alignment with 7 rows and 156 columns 499 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 500 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 501 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 502 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 503 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 504 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 505 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 506 >>> len(align) 507 7 508 509 We'll now construct a dummy record to append as an example: 510 511 >>> from Bio.Seq import Seq 512 >>> from Bio.SeqRecord import SeqRecord 513 >>> dummy = SeqRecord(Seq("N"*156), id="dummy") 514 515 Now append this to the alignment, 516 517 >>> align.append(dummy) 518 >>> print(align) 519 SingleLetterAlphabet() alignment with 8 rows and 156 columns 520 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191 521 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191 522 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191 523 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191 524 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191 525 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191 526 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191 527 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN dummy 528 >>> len(align) 529 8 530 531 """ 532 if self._records: 533 self._append(record, self.get_alignment_length()) 534 else: 535 self._append(record)
536
537 - def _append(self, record, expected_length=None):
538 """Helper function (PRIVATE).""" 539 if not isinstance(record, SeqRecord): 540 raise TypeError("New sequence is not a SeqRecord object") 541 542 # Currently the get_alignment_length() call is expensive, so we need 543 # to avoid calling it repeatedly for __init__ and extend, hence this 544 # private _append method 545 if expected_length is not None and len(record) != expected_length: 546 # TODO - Use the following more helpful error, but update unit tests 547 # raise ValueError("New sequence is not of length %i" \ 548 # % self.get_alignment_length()) 549 raise ValueError("Sequences must all be the same length") 550 551 # Using not self.alphabet.contains(record.seq.alphabet) needs fixing 552 # for AlphabetEncoders (e.g. gapped versus ungapped). 553 if not Alphabet._check_type_compatible([self._alphabet, record.seq.alphabet]): 554 raise ValueError("New sequence's alphabet is incompatible") 555 self._records.append(record)
556
557 - def __add__(self, other):
558 """Combines two alignments with the same number of rows by adding them. 559 560 If you have two multiple sequence alignments (MSAs), there are two ways to think 561 about adding them - by row or by column. Using the extend method adds by row. 562 Using the addition operator adds by column. For example, 563 564 >>> from Bio.Alphabet import generic_dna 565 >>> from Bio.Seq import Seq 566 >>> from Bio.SeqRecord import SeqRecord 567 >>> from Bio.Align import MultipleSeqAlignment 568 >>> a1 = SeqRecord(Seq("AAAAC", generic_dna), id="Alpha") 569 >>> b1 = SeqRecord(Seq("AAA-C", generic_dna), id="Beta") 570 >>> c1 = SeqRecord(Seq("AAAAG", generic_dna), id="Gamma") 571 >>> a2 = SeqRecord(Seq("GT", generic_dna), id="Alpha") 572 >>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta") 573 >>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma") 574 >>> left = MultipleSeqAlignment([a1, b1, c1], 575 ... annotations={"tool": "demo", "name": "start"}) 576 >>> right = MultipleSeqAlignment([a2, b2, c2], 577 ... annotations={"tool": "demo", "name": "end"}) 578 579 Now, let's look at these two alignments: 580 581 >>> print(left) 582 DNAAlphabet() alignment with 3 rows and 5 columns 583 AAAAC Alpha 584 AAA-C Beta 585 AAAAG Gamma 586 >>> print(right) 587 DNAAlphabet() alignment with 3 rows and 2 columns 588 GT Alpha 589 GT Beta 590 GT Gamma 591 592 And add them: 593 594 >>> combined = left + right 595 >>> print(combined) 596 DNAAlphabet() alignment with 3 rows and 7 columns 597 AAAACGT Alpha 598 AAA-CGT Beta 599 AAAAGGT Gamma 600 601 For this to work, both alignments must have the same number of records (here 602 they both have 3 rows): 603 604 >>> len(left) 605 3 606 >>> len(right) 607 3 608 >>> len(combined) 609 3 610 611 The individual rows are SeqRecord objects, and these can be added together. Refer 612 to the SeqRecord documentation for details of how the annotation is handled. This 613 example is a special case in that both original alignments shared the same names, 614 meaning when the rows are added they also get the same name. 615 616 Any common annotations are preserved, but differing annotation is lost. This is 617 the same behaviour used in the SeqRecord annotations and is designed to prevent 618 accidental propagation of inappropriate values: 619 620 >>> combined.annotations 621 {'tool': 'demo'} 622 623 """ 624 if not isinstance(other, MultipleSeqAlignment): 625 raise NotImplementedError 626 if len(self) != len(other): 627 raise ValueError("When adding two alignments they must have the same length" 628 " (i.e. same number or rows)") 629 alpha = Alphabet._consensus_alphabet([self._alphabet, other._alphabet]) 630 merged = (left + right for left, right in zip(self, other)) 631 # Take any common annotation: 632 annotations = dict() 633 for k, v in self.annotations.items(): 634 if k in other.annotations and other.annotations[k] == v: 635 annotations[k] = v 636 return MultipleSeqAlignment(merged, alpha, annotations)
637
638 - def __getitem__(self, index):
639 """Access part of the alignment. 640 641 Depending on the indices, you can get a SeqRecord object 642 (representing a single row), a Seq object (for a single columns), 643 a string (for a single characters) or another alignment 644 (representing some part or all of the alignment). 645 646 align[r,c] gives a single character as a string 647 align[r] gives a row as a SeqRecord 648 align[r,:] gives a row as a SeqRecord 649 align[:,c] gives a column as a Seq (using the alignment's alphabet) 650 651 align[:] and align[:,:] give a copy of the alignment 652 653 Anything else gives a sub alignment, e.g. 654 align[0:2] or align[0:2,:] uses only row 0 and 1 655 align[:,1:3] uses only columns 1 and 2 656 align[0:2,1:3] uses only rows 0 & 1 and only cols 1 & 2 657 658 We'll use the following example alignment here for illustration: 659 660 >>> from Bio.Alphabet import generic_dna 661 >>> from Bio.Seq import Seq 662 >>> from Bio.SeqRecord import SeqRecord 663 >>> from Bio.Align import MultipleSeqAlignment 664 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha") 665 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta") 666 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma") 667 >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta") 668 >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon") 669 >>> align = MultipleSeqAlignment([a, b, c, d, e], generic_dna) 670 671 You can access a row of the alignment as a SeqRecord using an integer 672 index (think of the alignment as a list of SeqRecord objects here): 673 674 >>> first_record = align[0] 675 >>> print("%s %s" % (first_record.id, first_record.seq)) 676 Alpha AAAACGT 677 >>> last_record = align[-1] 678 >>> print("%s %s" % (last_record.id, last_record.seq)) 679 Epsilon AAA-GGT 680 681 You can also access use python's slice notation to create a sub-alignment 682 containing only some of the SeqRecord objects: 683 684 >>> sub_alignment = align[2:5] 685 >>> print(sub_alignment) 686 DNAAlphabet() alignment with 3 rows and 7 columns 687 AAAAGGT Gamma 688 AAAACGT Delta 689 AAA-GGT Epsilon 690 691 This includes support for a step, i.e. align[start:end:step], which 692 can be used to select every second sequence: 693 694 >>> sub_alignment = align[::2] 695 >>> print(sub_alignment) 696 DNAAlphabet() alignment with 3 rows and 7 columns 697 AAAACGT Alpha 698 AAAAGGT Gamma 699 AAA-GGT Epsilon 700 701 Or to get a copy of the alignment with the rows in reverse order: 702 703 >>> rev_alignment = align[::-1] 704 >>> print(rev_alignment) 705 DNAAlphabet() alignment with 5 rows and 7 columns 706 AAA-GGT Epsilon 707 AAAACGT Delta 708 AAAAGGT Gamma 709 AAA-CGT Beta 710 AAAACGT Alpha 711 712 You can also use two indices to specify both rows and columns. Using simple 713 integers gives you the entry as a single character string. e.g. 714 715 >>> align[3, 4] 716 'C' 717 718 This is equivalent to: 719 720 >>> align[3][4] 721 'C' 722 723 or: 724 725 >>> align[3].seq[4] 726 'C' 727 728 To get a single column (as a string) use this syntax: 729 730 >>> align[:, 4] 731 'CCGCG' 732 733 Or, to get part of a column, 734 735 >>> align[1:3, 4] 736 'CG' 737 738 However, in general you get a sub-alignment, 739 740 >>> print(align[1:5, 3:6]) 741 DNAAlphabet() alignment with 4 rows and 3 columns 742 -CG Beta 743 AGG Gamma 744 ACG Delta 745 -GG Epsilon 746 747 This should all seem familiar to anyone who has used the NumPy 748 array or matrix objects. 749 """ 750 if isinstance(index, int): 751 # e.g. result = align[x] 752 # Return a SeqRecord 753 return self._records[index] 754 elif isinstance(index, slice): 755 # e.g. sub_align = align[i:j:k] 756 return MultipleSeqAlignment(self._records[index], self._alphabet) 757 elif len(index) != 2: 758 raise TypeError("Invalid index type.") 759 760 # Handle double indexing 761 row_index, col_index = index 762 if isinstance(row_index, int): 763 # e.g. row_or_part_row = align[6, 1:4], gives a SeqRecord 764 return self._records[row_index][col_index] 765 elif isinstance(col_index, int): 766 # e.g. col_or_part_col = align[1:5, 6], gives a string 767 return "".join(rec[col_index] for rec in self._records[row_index]) 768 else: 769 # e.g. sub_align = align[1:4, 5:7], gives another alignment 770 return MultipleSeqAlignment((rec[col_index] for rec in self._records[row_index]), 771 self._alphabet)
772
773 - def sort(self, key=None, reverse=False):
774 """Sort the rows (SeqRecord objects) of the alignment in place. 775 776 This sorts the rows alphabetically using the SeqRecord object id by 777 default. The sorting can be controlled by supplying a key function 778 which must map each SeqRecord to a sort value. 779 780 This is useful if you want to add two alignments which use the same 781 record identifiers, but in a different order. For example, 782 783 >>> from Bio.Alphabet import generic_dna 784 >>> from Bio.Seq import Seq 785 >>> from Bio.SeqRecord import SeqRecord 786 >>> from Bio.Align import MultipleSeqAlignment 787 >>> align1 = MultipleSeqAlignment([ 788 ... SeqRecord(Seq("ACGT", generic_dna), id="Human"), 789 ... SeqRecord(Seq("ACGG", generic_dna), id="Mouse"), 790 ... SeqRecord(Seq("ACGC", generic_dna), id="Chicken"), 791 ... ]) 792 >>> align2 = MultipleSeqAlignment([ 793 ... SeqRecord(Seq("CGGT", generic_dna), id="Mouse"), 794 ... SeqRecord(Seq("CGTT", generic_dna), id="Human"), 795 ... SeqRecord(Seq("CGCT", generic_dna), id="Chicken"), 796 ... ]) 797 798 If you simple try and add these without sorting, you get this: 799 800 >>> print(align1 + align2) 801 DNAAlphabet() alignment with 3 rows and 8 columns 802 ACGTCGGT <unknown id> 803 ACGGCGTT <unknown id> 804 ACGCCGCT Chicken 805 806 Consult the SeqRecord documentation which explains why you get a 807 default value when annotation like the identifier doesn't match up. 808 However, if we sort the alignments first, then add them we get the 809 desired result: 810 811 >>> align1.sort() 812 >>> align2.sort() 813 >>> print(align1 + align2) 814 DNAAlphabet() alignment with 3 rows and 8 columns 815 ACGCCGCT Chicken 816 ACGTCGTT Human 817 ACGGCGGT Mouse 818 819 As an example using a different sort order, you could sort on the 820 GC content of each sequence. 821 822 >>> from Bio.SeqUtils import GC 823 >>> print(align1) 824 DNAAlphabet() alignment with 3 rows and 4 columns 825 ACGC Chicken 826 ACGT Human 827 ACGG Mouse 828 >>> align1.sort(key = lambda record: GC(record.seq)) 829 >>> print(align1) 830 DNAAlphabet() alignment with 3 rows and 4 columns 831 ACGT Human 832 ACGC Chicken 833 ACGG Mouse 834 835 There is also a reverse argument, so if you wanted to sort by ID 836 but backwards: 837 838 >>> align1.sort(reverse=True) 839 >>> print(align1) 840 DNAAlphabet() alignment with 3 rows and 4 columns 841 ACGG Mouse 842 ACGT Human 843 ACGC Chicken 844 845 """ 846 if key is None: 847 self._records.sort(key=lambda r: r.id, reverse=reverse) 848 else: 849 self._records.sort(key=key, reverse=reverse)
850 851 852 if __name__ == "__main__": 853 from Bio._utils import run_doctest 854 run_doctest() 855