Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  # Copyright 2000-2004 Brad Chapman. 
  2  # Copyright 2001 Iddo Friedberg. 
  3  # Copyright 2007-2016 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """Classes for generic sequence alignment. 
  9   
 10  Contains classes to deal with generic sequence alignment stuff not 
 11  specific to a particular program or format. 
 12  """ 
 13  from __future__ import print_function 
 14   
 15  from Bio.Seq import Seq 
 16  from Bio.SeqRecord import SeqRecord 
 17  from Bio import Alphabet 
 18   
 19   
20 -class Alignment(object):
21 """Represent a set of alignments (DEPRECATED). 22 23 This is a base class to represent alignments, which can be subclassed 24 to deal with an alignment in a specific format. 25 26 With the introduction of the MultipleSeqAlignment class in Bio.Align, 27 this base class is deprecated and is likely to be removed in future 28 releases of Biopython. 29 """
30 - def __init__(self, alphabet):
31 """Initialize a new Alignment object. 32 33 Arguments: 34 - alphabet - The alphabet to use for the sequence objects that are 35 created. This alphabet must be a gapped type. 36 37 e.g. 38 39 >>> from Bio.Alphabet import IUPAC, Gapped 40 >>> from Bio.Align.Generic import Alignment 41 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 42 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 43 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 44 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 45 >>> print(align) 46 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 47 ACTGCTAGCTAG Alpha 48 ACT-CTAGCTAG Beta 49 ACTGCTAGATAG Gamma 50 """ 51 import warnings 52 import Bio 53 warnings.warn("With the introduction of the MultipleSeqAlignment " 54 "class in Bio.Align, this base class is deprecated " 55 "and is likely to be removed in a future release of " 56 "Biopython.", Bio.BiopythonDeprecationWarning) 57 if not (isinstance(alphabet, Alphabet.Alphabet) or 58 isinstance(alphabet, Alphabet.AlphabetEncoder)): 59 raise ValueError("Invalid alphabet argument") 60 self._alphabet = alphabet 61 # hold everything at a list of SeqRecord objects 62 self._records = []
63
64 - def _str_line(self, record, length=50):
65 """Returns a truncated string representation of a SeqRecord (PRIVATE). 66 67 This is a PRIVATE function used by the __str__ method. 68 """ 69 if record.seq.__class__.__name__ == "CodonSeq": 70 if len(record.seq) <= length: 71 return "%s %s" % (record.seq, record.id) 72 else: 73 return "%s...%s %s" \ 74 % (record.seq[:length - 3], record.seq[-3:], record.id) 75 else: 76 if len(record.seq) <= length: 77 return "%s %s" % (record.seq, record.id) 78 else: 79 return "%s...%s %s" \ 80 % (record.seq[:length - 6], record.seq[-3:], record.id)
81
82 - def __str__(self):
83 """Returns a multi-line string summary of the alignment. 84 85 This output is intended to be readable, but large alignments are 86 shown truncated. A maximum of 20 rows (sequences) and 50 columns 87 are shown, with the record identifiers. This should fit nicely on a 88 single screen. e.g. 89 90 >>> from Bio.Alphabet import IUPAC, Gapped 91 >>> from Bio.Align.Generic import Alignment 92 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 93 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 94 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 95 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 96 >>> print(align) 97 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 98 ACTGCTAGCTAG Alpha 99 ACT-CTAGCTAG Beta 100 ACTGCTAGATAG Gamma 101 102 See also the alignment's format method. 103 """ 104 rows = len(self._records) 105 lines = ["%s alignment with %i rows and %i columns" 106 % (str(self._alphabet), rows, self.get_alignment_length())] 107 if rows <= 20: 108 lines.extend(self._str_line(rec) for rec in self._records) 109 else: 110 lines.extend(self._str_line(rec) for rec in self._records[:18]) 111 lines.append("...") 112 lines.append(self._str_line(self._records[-1])) 113 return "\n".join(lines)
114
115 - def __repr__(self):
116 """Returns a representation of the object for debugging. 117 118 The representation cannot be used with eval() to recreate the object, 119 which is usually possible with simple python ojects. For example: 120 121 <Bio.Align.Generic.Alignment instance (2 records of length 14, 122 SingleLetterAlphabet()) at a3c184c> 123 124 The hex string is the memory address of the object, see help(id). 125 This provides a simple way to visually distinguish alignments of 126 the same size. 127 """ 128 # A doctest for __repr__ would be nice, but __class__ comes out differently 129 # if run via the __main__ trick. 130 return "<%s instance (%i records of length %i, %s) at %x>" % \ 131 (self.__class__, len(self._records), 132 self.get_alignment_length(), repr(self._alphabet), id(self))
133 # This version is useful for doing eval(repr(alignment)), 134 # but it can be VERY long: 135 # return "%s(%s, %s)" \ 136 # % (self.__class__, repr(self._records), repr(self._alphabet)) 137
138 - def format(self, format):
139 """Returns the alignment as a string in the specified file format. 140 141 The format should be a lower case string supported as an output 142 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 143 "stockholm", etc), which is used to turn the alignment into a 144 string. 145 146 e.g. 147 148 >>> from Bio.Alphabet import IUPAC, Gapped 149 >>> from Bio.Align.Generic import Alignment 150 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 151 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 152 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 153 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 154 >>> print(align.format("fasta")) 155 >Alpha 156 ACTGCTAGCTAG 157 >Beta 158 ACT-CTAGCTAG 159 >Gamma 160 ACTGCTAGATAG 161 <BLANKLINE> 162 >>> print(align.format("phylip")) 163 3 12 164 Alpha ACTGCTAGCT AG 165 Beta ACT-CTAGCT AG 166 Gamma ACTGCTAGAT AG 167 <BLANKLINE> 168 169 For Python 2.6, 3.0 or later see also the built in format() function. 170 """ 171 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 172 # See also the SeqRecord class and its format() method using Bio.SeqIO 173 return self.__format__(format)
174
175 - def __format__(self, format_spec):
176 """Returns the alignment as a string in the specified file format. 177 178 This method supports the python format() function added in 179 Python 2.6/3.0. The format_spec should be a lower case 180 string supported by Bio.AlignIO as an output file format. 181 See also the alignment's format() method.""" 182 if format_spec: 183 from Bio._py3k import StringIO 184 from Bio import AlignIO 185 handle = StringIO() 186 AlignIO.write([self], handle, format_spec) 187 return handle.getvalue() 188 else: 189 # Follow python convention and default to using __str__ 190 return str(self)
191
192 - def get_all_seqs(self):
193 """Return all of the sequences involved in the alignment (DEPRECATED). 194 195 The return value is a list of SeqRecord objects. 196 197 This method is deprecated, as the Alignment object itself now offers 198 much of the functionality of a list of SeqRecord objects (e.g. 199 iteration or slicing to create a sub-alignment). Instead use the 200 Python builtin function list, i.e. my_list = list(my_align) 201 """ 202 import warnings 203 import Bio 204 warnings.warn("This method is deprecated, since the alignment object" 205 "now acts more like a list. Instead of calling " 206 "align.get_all_seqs() you can use list(align)", 207 Bio.BiopythonDeprecationWarning) 208 return self._records
209
210 - def __iter__(self):
211 """Iterate over alignment rows as SeqRecord objects. 212 213 e.g. 214 215 >>> from Bio.Alphabet import IUPAC, Gapped 216 >>> from Bio.Align.Generic import Alignment 217 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 218 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 219 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 220 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 221 >>> for record in align: 222 ... print(record.id) 223 ... print(record.seq) 224 Alpha 225 ACTGCTAGCTAG 226 Beta 227 ACT-CTAGCTAG 228 Gamma 229 ACTGCTAGATAG 230 """ 231 return iter(self._records)
232
233 - def get_seq_by_num(self, number):
234 """Retrieve a sequence by row number (DEPRECATED). 235 236 Returns: 237 - A Seq object for the requested sequence. 238 239 Raises: 240 - IndexError - If the specified number is out of range. 241 242 NOTE: This is a legacy method. In new code where you need to access 243 the rows of the alignment (i.e. the sequences) consider iterating 244 over them or accessing them as SeqRecord objects. 245 """ 246 import warnings 247 import Bio 248 warnings.warn("This is a legacy method and is likely to be removed " 249 "in a future release of Biopython. In new code where " 250 "you need to access the rows of the alignment (i.e. the " 251 "sequences) consider iterating over them or accessing " 252 "them as SeqRecord objects.", 253 Bio.BiopythonDeprecationWarning) 254 return self._records[number].seq
255
256 - def __len__(self):
257 """Returns the number of sequences in the alignment. 258 259 Use len(alignment) to get the number of sequences (i.e. the number of 260 rows), and alignment.get_alignment_length() to get the length of the 261 longest sequence (i.e. the number of columns). 262 263 This is easy to remember if you think of the alignment as being like a 264 list of SeqRecord objects. 265 """ 266 return len(self._records)
267
268 - def get_alignment_length(self):
269 """Return the maximum length of the alignment. 270 271 All objects in the alignment should (hopefully) have the same 272 length. This function will go through and find this length 273 by finding the maximum length of sequences in the alignment. 274 275 >>> from Bio.Alphabet import IUPAC, Gapped 276 >>> from Bio.Align.Generic import Alignment 277 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 278 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 279 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 280 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 281 >>> align.get_alignment_length() 282 12 283 284 If you want to know the number of sequences in the alignment, 285 use len(align) instead: 286 287 >>> len(align) 288 3 289 290 """ 291 max_length = 0 292 293 for record in self._records: 294 if len(record.seq) > max_length: 295 max_length = len(record.seq) 296 297 return max_length
298
299 - def add_sequence(self, descriptor, sequence, start=None, end=None, 300 weight=1.0):
301 """Add a sequence to the alignment. 302 303 This doesn't do any kind of alignment, it just adds in the sequence 304 object, which is assumed to be prealigned with the existing 305 sequences. 306 307 Arguments: 308 - descriptor - The descriptive id of the sequence being added. 309 This will be used as the resulting SeqRecord's 310 .id property (and, for historical compatibility, 311 also the .description property) 312 - sequence - A string with sequence info. 313 - start - You can explicitly set the start point of the sequence. 314 This is useful (at least) for BLAST alignments, which can 315 just be partial alignments of sequences. 316 - end - Specify the end of the sequence, which is important 317 for the same reason as the start. 318 - weight - The weight to place on the sequence in the alignment. 319 By default, all sequences have the same weight. (0.0 => 320 no weight, 1.0 => highest weight) 321 """ 322 new_seq = Seq(sequence, self._alphabet) 323 324 # We are now effectively using the SeqRecord's .id as 325 # the primary identifier (e.g. in Bio.SeqIO) so we should 326 # populate it with the descriptor. 327 # For backwards compatibility, also store this in the 328 # SeqRecord's description property. 329 new_record = SeqRecord(new_seq, 330 id=descriptor, 331 description=descriptor) 332 333 # hack! We really need to work out how to deal with annotations 334 # and features in biopython. Right now, I'll just use the 335 # generic annotations dictionary we've got to store the start 336 # and end, but we should think up something better. I don't know 337 # if I'm really a big fan of the LocatableSeq thing they've got 338 # in BioPerl, but I'm not positive what the best thing to do on 339 # this is... 340 if start: 341 new_record.annotations['start'] = start 342 if end: 343 new_record.annotations['end'] = end 344 345 # another hack to add weight information to the sequence 346 new_record.annotations['weight'] = weight 347 348 self._records.append(new_record)
349
350 - def get_column(self, col):
351 """Returns a string containing a given column. 352 353 e.g. 354 355 >>> from Bio.Alphabet import IUPAC, Gapped 356 >>> from Bio.Align.Generic import Alignment 357 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 358 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 359 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 360 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 361 >>> align.get_column(0) 362 'AAA' 363 >>> align.get_column(3) 364 'G-G' 365 """ 366 # TODO - Support negative indices? 367 col_str = '' 368 assert col >= 0 and col <= self.get_alignment_length() 369 for rec in self._records: 370 col_str += rec.seq[col] 371 return col_str
372
373 - def __getitem__(self, index):
374 """Access part of the alignment. 375 376 We'll use the following example alignment here for illustration: 377 378 >>> from Bio.Alphabet import IUPAC, Gapped 379 >>> from Bio.Align.Generic import Alignment 380 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 381 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 382 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 383 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 384 >>> align.add_sequence("Delta", "ACTGCTTGCTAG") 385 >>> align.add_sequence("Epsilon", "ACTGCTTGATAG") 386 387 You can access a row of the alignment as a SeqRecord using an integer 388 index (think of the alignment as a list of SeqRecord objects here): 389 390 >>> first_record = align[0] 391 >>> print("%s %s" % (first_record.id, first_record.seq)) 392 Alpha ACTGCTAGCTAG 393 >>> last_record = align[-1] 394 >>> print("%s %s" % (last_record.id, last_record.seq)) 395 Epsilon ACTGCTTGATAG 396 397 You can also access use python's slice notation to create a sub-alignment 398 containing only some of the SeqRecord objects: 399 400 >>> sub_alignment = align[2:5] 401 >>> print(sub_alignment) 402 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 403 ACTGCTAGATAG Gamma 404 ACTGCTTGCTAG Delta 405 ACTGCTTGATAG Epsilon 406 407 This includes support for a step, i.e. align[start:end:step], which 408 can be used to select every second sequence: 409 410 >>> sub_alignment = align[::2] 411 >>> print(sub_alignment) 412 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 413 ACTGCTAGCTAG Alpha 414 ACTGCTAGATAG Gamma 415 ACTGCTTGATAG Epsilon 416 417 Or to get a copy of the alignment with the rows in reverse order: 418 419 >>> rev_alignment = align[::-1] 420 >>> print(rev_alignment) 421 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns 422 ACTGCTTGATAG Epsilon 423 ACTGCTTGCTAG Delta 424 ACTGCTAGATAG Gamma 425 ACT-CTAGCTAG Beta 426 ACTGCTAGCTAG Alpha 427 428 Right now, these are the ONLY indexing operations supported. The use of 429 a second column based index is under discussion for a future update. 430 """ 431 if isinstance(index, int): 432 # e.g. result = align[x] 433 # Return a SeqRecord 434 return self._records[index] 435 elif isinstance(index, slice): 436 # e.g. sub_aling = align[i:j:k] 437 # Return a new Alignment using only the specified records. 438 # TODO - See Bug 2554 for changing the __init__ method 439 # to allow us to do this more cleanly. 440 sub_align = Alignment(self._alphabet) 441 sub_align._records = self._records[index] 442 return sub_align 443 elif len(index) == 2: 444 raise TypeError("Row and Column indexing is not currently supported," 445 "but may be in future.") 446 else: 447 raise TypeError("Invalid index type.")
448 449 450 if __name__ == "__main__": 451 from Bio._utils import run_doctest 452 run_doctest() 453