Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  # Copyright 2000-2004 Brad Chapman. 
  2  # Copyright 2001 Iddo Friedberg. 
  3  # Copyright 2007-2016 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """Classes for generic sequence alignment. 
  9   
 10  Contains classes to deal with generic sequence alignment stuff not 
 11  specific to a particular program or format. 
 12  """ 
 13  from __future__ import print_function 
 14   
 15  from Bio.Seq import Seq 
 16  from Bio.SeqRecord import SeqRecord 
 17  from Bio import Alphabet 
 18   
 19   
20 -class Alignment(object):
21 """Represent a set of alignments (DEPRECATED). 22 23 This is a base class to represent alignments, which can be subclassed 24 to deal with an alignment in a specific format. 25 26 With the introduction of the MultipleSeqAlignment class in Bio.Align, 27 this base class is deprecated and is likely to be removed in future 28 releases of Biopython. 29 """
30 - def __init__(self, alphabet):
31 """Initialize a new Alignment object. 32 33 Arguments: 34 - alphabet - The alphabet to use for the sequence objects that are 35 created. This alphabet must be a gapped type. 36 37 e.g. 38 39 >>> from Bio.Alphabet import IUPAC, Gapped 40 >>> from Bio.Align.Generic import Alignment 41 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 42 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 43 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 44 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 45 >>> print(align) 46 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 47 ACTGCTAGCTAG Alpha 48 ACT-CTAGCTAG Beta 49 ACTGCTAGATAG Gamma 50 """ 51 import warnings 52 import Bio 53 warnings.warn("With the introduction of the MultipleSeqAlignment class in Bio.Align, this base class is deprecated and is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning) 54 if not (isinstance(alphabet, Alphabet.Alphabet) or 55 isinstance(alphabet, Alphabet.AlphabetEncoder)): 56 raise ValueError("Invalid alphabet argument") 57 self._alphabet = alphabet 58 # hold everything at a list of SeqRecord objects 59 self._records = []
60
61 - def _str_line(self, record, length=50):
62 """Returns a truncated string representation of a SeqRecord (PRIVATE). 63 64 This is a PRIVATE function used by the __str__ method. 65 """ 66 if record.seq.__class__.__name__ == "CodonSeq": 67 if len(record.seq) <= length: 68 return "%s %s" % (record.seq, record.id) 69 else: 70 return "%s...%s %s" \ 71 % (record.seq[:length - 3], record.seq[-3:], record.id) 72 else: 73 if len(record.seq) <= length: 74 return "%s %s" % (record.seq, record.id) 75 else: 76 return "%s...%s %s" \ 77 % (record.seq[:length - 6], record.seq[-3:], record.id)
78
79 - def __str__(self):
80 """Returns a multi-line string summary of the alignment. 81 82 This output is intended to be readable, but large alignments are 83 shown truncated. A maximum of 20 rows (sequences) and 50 columns 84 are shown, with the record identifiers. This should fit nicely on a 85 single screen. e.g. 86 87 >>> from Bio.Alphabet import IUPAC, Gapped 88 >>> from Bio.Align.Generic import Alignment 89 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 90 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 91 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 92 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 93 >>> print(align) 94 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 95 ACTGCTAGCTAG Alpha 96 ACT-CTAGCTAG Beta 97 ACTGCTAGATAG Gamma 98 99 See also the alignment's format method. 100 """ 101 rows = len(self._records) 102 lines = ["%s alignment with %i rows and %i columns" 103 % (str(self._alphabet), rows, self.get_alignment_length())] 104 if rows <= 20: 105 lines.extend(self._str_line(rec) for rec in self._records) 106 else: 107 lines.extend(self._str_line(rec) for rec in self._records[:18]) 108 lines.append("...") 109 lines.append(self._str_line(self._records[-1])) 110 return "\n".join(lines)
111
112 - def __repr__(self):
113 """Returns a representation of the object for debugging. 114 115 The representation cannot be used with eval() to recreate the object, 116 which is usually possible with simple python ojects. For example: 117 118 <Bio.Align.Generic.Alignment instance (2 records of length 14, 119 SingleLetterAlphabet()) at a3c184c> 120 121 The hex string is the memory address of the object, see help(id). 122 This provides a simple way to visually distinguish alignments of 123 the same size. 124 """ 125 # A doctest for __repr__ would be nice, but __class__ comes out differently 126 # if run via the __main__ trick. 127 return "<%s instance (%i records of length %i, %s) at %x>" % \ 128 (self.__class__, len(self._records), 129 self.get_alignment_length(), repr(self._alphabet), id(self))
130 # This version is useful for doing eval(repr(alignment)), 131 # but it can be VERY long: 132 # return "%s(%s, %s)" \ 133 # % (self.__class__, repr(self._records), repr(self._alphabet)) 134
135 - def format(self, format):
136 """Returns the alignment as a string in the specified file format. 137 138 The format should be a lower case string supported as an output 139 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 140 "stockholm", etc), which is used to turn the alignment into a 141 string. 142 143 e.g. 144 145 >>> from Bio.Alphabet import IUPAC, Gapped 146 >>> from Bio.Align.Generic import Alignment 147 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 148 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 149 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 150 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 151 >>> print(align.format("fasta")) 152 >Alpha 153 ACTGCTAGCTAG 154 >Beta 155 ACT-CTAGCTAG 156 >Gamma 157 ACTGCTAGATAG 158 <BLANKLINE> 159 >>> print(align.format("phylip")) 160 3 12 161 Alpha ACTGCTAGCT AG 162 Beta ACT-CTAGCT AG 163 Gamma ACTGCTAGAT AG 164 <BLANKLINE> 165 166 For Python 2.6, 3.0 or later see also the built in format() function. 167 """ 168 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 169 # See also the SeqRecord class and its format() method using Bio.SeqIO 170 return self.__format__(format)
171
172 - def __format__(self, format_spec):
173 """Returns the alignment as a string in the specified file format. 174 175 This method supports the python format() function added in 176 Python 2.6/3.0. The format_spec should be a lower case 177 string supported by Bio.AlignIO as an output file format. 178 See also the alignment's format() method.""" 179 if format_spec: 180 from Bio._py3k import StringIO 181 from Bio import AlignIO 182 handle = StringIO() 183 AlignIO.write([self], handle, format_spec) 184 return handle.getvalue() 185 else: 186 # Follow python convention and default to using __str__ 187 return str(self)
188
189 - def get_all_seqs(self):
190 """Return all of the sequences involved in the alignment (DEPRECATED). 191 192 The return value is a list of SeqRecord objects. 193 194 This method is deprecated, as the Alignment object itself now offers 195 much of the functionality of a list of SeqRecord objects (e.g. 196 iteration or slicing to create a sub-alignment). Instead use the 197 Python builtin function list, i.e. my_list = list(my_align) 198 """ 199 import warnings 200 import Bio 201 warnings.warn("This method is deprecated, since the alignment object" 202 "now acts more like a list. Instead of calling " 203 "align.get_all_seqs() you can use list(align)", 204 Bio.BiopythonDeprecationWarning) 205 return self._records
206
207 - def __iter__(self):
208 """Iterate over alignment rows as SeqRecord objects. 209 210 e.g. 211 212 >>> from Bio.Alphabet import IUPAC, Gapped 213 >>> from Bio.Align.Generic import Alignment 214 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 215 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 216 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 217 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 218 >>> for record in align: 219 ... print(record.id) 220 ... print(record.seq) 221 Alpha 222 ACTGCTAGCTAG 223 Beta 224 ACT-CTAGCTAG 225 Gamma 226 ACTGCTAGATAG 227 """ 228 return iter(self._records)
229
230 - def get_seq_by_num(self, number):
231 """Retrieve a sequence by row number (DEPRECATED). 232 233 Returns: 234 - A Seq object for the requested sequence. 235 236 Raises: 237 - IndexError - If the specified number is out of range. 238 239 NOTE: This is a legacy method. In new code where you need to access 240 the rows of the alignment (i.e. the sequences) consider iterating 241 over them or accessing them as SeqRecord objects. 242 """ 243 import warnings 244 import Bio 245 warnings.warn("This is a legacy method and is likely to be removed in a future release of Biopython. In new code where you need to access the rows of the alignment (i.e. the sequences) consider iterating over them or accessing them as SeqRecord objects.", Bio.BiopythonDeprecationWarning) 246 return self._records[number].seq
247
248 - def __len__(self):
249 """Returns the number of sequences in the alignment. 250 251 Use len(alignment) to get the number of sequences (i.e. the number of 252 rows), and alignment.get_alignment_length() to get the length of the 253 longest sequence (i.e. the number of columns). 254 255 This is easy to remember if you think of the alignment as being like a 256 list of SeqRecord objects. 257 """ 258 return len(self._records)
259
260 - def get_alignment_length(self):
261 """Return the maximum length of the alignment. 262 263 All objects in the alignment should (hopefully) have the same 264 length. This function will go through and find this length 265 by finding the maximum length of sequences in the alignment. 266 267 >>> from Bio.Alphabet import IUPAC, Gapped 268 >>> from Bio.Align.Generic import Alignment 269 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 270 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 271 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 272 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 273 >>> align.get_alignment_length() 274 12 275 276 If you want to know the number of sequences in the alignment, 277 use len(align) instead: 278 279 >>> len(align) 280 3 281 282 """ 283 max_length = 0 284 285 for record in self._records: 286 if len(record.seq) > max_length: 287 max_length = len(record.seq) 288 289 return max_length
290
291 - def add_sequence(self, descriptor, sequence, start=None, end=None, 292 weight=1.0):
293 """Add a sequence to the alignment. 294 295 This doesn't do any kind of alignment, it just adds in the sequence 296 object, which is assumed to be prealigned with the existing 297 sequences. 298 299 Arguments: 300 - descriptor - The descriptive id of the sequence being added. 301 This will be used as the resulting SeqRecord's 302 .id property (and, for historical compatibility, 303 also the .description property) 304 - sequence - A string with sequence info. 305 - start - You can explicitly set the start point of the sequence. 306 This is useful (at least) for BLAST alignments, which can 307 just be partial alignments of sequences. 308 - end - Specify the end of the sequence, which is important 309 for the same reason as the start. 310 - weight - The weight to place on the sequence in the alignment. 311 By default, all sequences have the same weight. (0.0 => 312 no weight, 1.0 => highest weight) 313 """ 314 new_seq = Seq(sequence, self._alphabet) 315 316 # We are now effectively using the SeqRecord's .id as 317 # the primary identifier (e.g. in Bio.SeqIO) so we should 318 # populate it with the descriptor. 319 # For backwards compatibility, also store this in the 320 # SeqRecord's description property. 321 new_record = SeqRecord(new_seq, 322 id=descriptor, 323 description=descriptor) 324 325 # hack! We really need to work out how to deal with annotations 326 # and features in biopython. Right now, I'll just use the 327 # generic annotations dictionary we've got to store the start 328 # and end, but we should think up something better. I don't know 329 # if I'm really a big fan of the LocatableSeq thing they've got 330 # in BioPerl, but I'm not positive what the best thing to do on 331 # this is... 332 if start: 333 new_record.annotations['start'] = start 334 if end: 335 new_record.annotations['end'] = end 336 337 # another hack to add weight information to the sequence 338 new_record.annotations['weight'] = weight 339 340 self._records.append(new_record)
341
342 - def get_column(self, col):
343 """Returns a string containing a given column. 344 345 e.g. 346 347 >>> from Bio.Alphabet import IUPAC, Gapped 348 >>> from Bio.Align.Generic import Alignment 349 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 350 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 351 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 352 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 353 >>> align.get_column(0) 354 'AAA' 355 >>> align.get_column(3) 356 'G-G' 357 """ 358 # TODO - Support negative indices? 359 col_str = '' 360 assert col >= 0 and col <= self.get_alignment_length() 361 for rec in self._records: 362 col_str += rec.seq[col] 363 return col_str
364
365 - def __getitem__(self, index):
366 """Access part of the alignment. 367 368 We'll use the following example alignment here for illustration: 369 370 >>> from Bio.Alphabet import IUPAC, Gapped 371 >>> from Bio.Align.Generic import Alignment 372 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 373 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 374 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 375 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 376 >>> align.add_sequence("Delta", "ACTGCTTGCTAG") 377 >>> align.add_sequence("Epsilon", "ACTGCTTGATAG") 378 379 You can access a row of the alignment as a SeqRecord using an integer 380 index (think of the alignment as a list of SeqRecord objects here): 381 382 >>> first_record = align[0] 383 >>> print("%s %s" % (first_record.id, first_record.seq)) 384 Alpha ACTGCTAGCTAG 385 >>> last_record = align[-1] 386 >>> print("%s %s" % (last_record.id, last_record.seq)) 387 Epsilon ACTGCTTGATAG 388 389 You can also access use python's slice notation to create a sub-alignment 390 containing only some of the SeqRecord objects: 391 392 >>> sub_alignment = align[2:5] 393 >>> print(sub_alignment) 394 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 395 ACTGCTAGATAG Gamma 396 ACTGCTTGCTAG Delta 397 ACTGCTTGATAG Epsilon 398 399 This includes support for a step, i.e. align[start:end:step], which 400 can be used to select every second sequence: 401 402 >>> sub_alignment = align[::2] 403 >>> print(sub_alignment) 404 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 405 ACTGCTAGCTAG Alpha 406 ACTGCTAGATAG Gamma 407 ACTGCTTGATAG Epsilon 408 409 Or to get a copy of the alignment with the rows in reverse order: 410 411 >>> rev_alignment = align[::-1] 412 >>> print(rev_alignment) 413 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns 414 ACTGCTTGATAG Epsilon 415 ACTGCTTGCTAG Delta 416 ACTGCTAGATAG Gamma 417 ACT-CTAGCTAG Beta 418 ACTGCTAGCTAG Alpha 419 420 Right now, these are the ONLY indexing operations supported. The use of 421 a second column based index is under discussion for a future update. 422 """ 423 if isinstance(index, int): 424 # e.g. result = align[x] 425 # Return a SeqRecord 426 return self._records[index] 427 elif isinstance(index, slice): 428 # e.g. sub_aling = align[i:j:k] 429 # Return a new Alignment using only the specified records. 430 # TODO - See Bug 2554 for changing the __init__ method 431 # to allow us to do this more cleanly. 432 sub_align = Alignment(self._alphabet) 433 sub_align._records = self._records[index] 434 return sub_align 435 elif len(index) == 2: 436 raise TypeError("Row and Column indexing is not currently supported," 437 "but may be in future.") 438 else: 439 raise TypeError("Invalid index type.")
440 441 442 if __name__ == "__main__": 443 from Bio._utils import run_doctest 444 run_doctest() 445