Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  # Copyright 2000-2004 Brad Chapman. 
  2  # Copyright 2001 Iddo Friedberg. 
  3  # Copyright 2007-2010 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """Classes for generic sequence alignment. 
  9   
 10  Contains classes to deal with generic sequence alignment stuff not 
 11  specific to a particular program or format. 
 12   
 13  Classes: 
 14   - Alignment 
 15  """ 
 16  from __future__ import print_function 
 17   
 18  __docformat__ = "epytext en"  # Don't just use plain text in epydoc API pages! 
 19   
 20  # biopython 
 21  from Bio.Seq import Seq 
 22  from Bio.SeqRecord import SeqRecord 
 23  from Bio import Alphabet 
 24   
 25   
26 -class Alignment(object):
27 """Represent a set of alignments (DEPRECATED). 28 29 This is a base class to represent alignments, which can be subclassed 30 to deal with an alignment in a specific format. 31 32 With the introduction of the MultipleSeqAlignment class in Bio.Align, 33 this base class is deprecated and is likely to be removed in future 34 releases of Biopython. 35 """
36 - def __init__(self, alphabet):
37 """Initialize a new Alignment object. 38 39 Arguments: 40 - alphabet - The alphabet to use for the sequence objects that are 41 created. This alphabet must be a gapped type. 42 43 e.g. 44 45 >>> from Bio.Alphabet import IUPAC, Gapped 46 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 47 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 48 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 49 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 50 >>> print(align) 51 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 52 ACTGCTAGCTAG Alpha 53 ACT-CTAGCTAG Beta 54 ACTGCTAGATAG Gamma 55 """ 56 import warnings 57 import Bio 58 warnings.warn("With the introduction of the MultipleSeqAlignment class in Bio.Align, this base class is deprecated and is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning) 59 if not (isinstance(alphabet, Alphabet.Alphabet) 60 or isinstance(alphabet, Alphabet.AlphabetEncoder)): 61 raise ValueError("Invalid alphabet argument") 62 self._alphabet = alphabet 63 # hold everything at a list of SeqRecord objects 64 self._records = []
65
66 - def _str_line(self, record):
67 """Returns a truncated string representation of a SeqRecord (PRIVATE). 68 69 This is a PRIVATE function used by the __str__ method. 70 """ 71 if len(record.seq) <= 50: 72 return "%s %s" % (record.seq, record.id) 73 else: 74 return "%s...%s %s" \ 75 % (record.seq[:44], record.seq[-3:], record.id)
76
77 - def __str__(self):
78 """Returns a multi-line string summary of the alignment. 79 80 This output is intended to be readable, but large alignments are 81 shown truncated. A maximum of 20 rows (sequences) and 50 columns 82 are shown, with the record identifiers. This should fit nicely on a 83 single screen. e.g. 84 85 >>> from Bio.Alphabet import IUPAC, Gapped 86 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 87 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 88 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 89 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 90 >>> print(align) 91 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 92 ACTGCTAGCTAG Alpha 93 ACT-CTAGCTAG Beta 94 ACTGCTAGATAG Gamma 95 96 See also the alignment's format method. 97 """ 98 rows = len(self._records) 99 lines = ["%s alignment with %i rows and %i columns" 100 % (str(self._alphabet), rows, self.get_alignment_length())] 101 if rows <= 20: 102 lines.extend(self._str_line(rec) for rec in self._records) 103 else: 104 lines.extend(self._str_line(rec) for rec in self._records[:18]) 105 lines.append("...") 106 lines.append(self._str_line(self._records[-1])) 107 return "\n".join(lines)
108
109 - def __repr__(self):
110 """Returns a representation of the object for debugging. 111 112 The representation cannot be used with eval() to recreate the object, 113 which is usually possible with simple python ojects. For example: 114 115 <Bio.Align.Generic.Alignment instance (2 records of length 14, 116 SingleLetterAlphabet()) at a3c184c> 117 118 The hex string is the memory address of the object, see help(id). 119 This provides a simple way to visually distinguish alignments of 120 the same size. 121 """ 122 #A doctest for __repr__ would be nice, but __class__ comes out differently 123 #if run via the __main__ trick. 124 return "<%s instance (%i records of length %i, %s) at %x>" % \ 125 (self.__class__, len(self._records), 126 self.get_alignment_length(), repr(self._alphabet), id(self))
127 #This version is useful for doing eval(repr(alignment)), 128 #but it can be VERY long: 129 #return "%s(%s, %s)" \ 130 # % (self.__class__, repr(self._records), repr(self._alphabet)) 131
132 - def format(self, format):
133 """Returns the alignment as a string in the specified file format. 134 135 The format should be a lower case string supported as an output 136 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 137 "stockholm", etc), which is used to turn the alignment into a 138 string. 139 140 e.g. 141 142 >>> from Bio.Alphabet import IUPAC, Gapped 143 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 144 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 145 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 146 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 147 >>> print(align.format("fasta")) 148 >Alpha 149 ACTGCTAGCTAG 150 >Beta 151 ACT-CTAGCTAG 152 >Gamma 153 ACTGCTAGATAG 154 <BLANKLINE> 155 >>> print(align.format("phylip")) 156 3 12 157 Alpha ACTGCTAGCT AG 158 Beta ACT-CTAGCT AG 159 Gamma ACTGCTAGAT AG 160 <BLANKLINE> 161 162 For Python 2.6, 3.0 or later see also the built in format() function. 163 """ 164 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 165 #See also the SeqRecord class and its format() method using Bio.SeqIO 166 return self.__format__(format)
167
168 - def __format__(self, format_spec):
169 """Returns the alignment as a string in the specified file format. 170 171 This method supports the python format() function added in 172 Python 2.6/3.0. The format_spec should be a lower case 173 string supported by Bio.AlignIO as an output file format. 174 See also the alignment's format() method.""" 175 if format_spec: 176 from Bio._py3k import StringIO 177 from Bio import AlignIO 178 handle = StringIO() 179 AlignIO.write([self], handle, format_spec) 180 return handle.getvalue() 181 else: 182 #Follow python convention and default to using __str__ 183 return str(self)
184
185 - def get_all_seqs(self):
186 """Return all of the sequences involved in the alignment (DEPRECATED). 187 188 The return value is a list of SeqRecord objects. 189 190 This method is deprecated, as the Alignment object itself now offers 191 much of the functionality of a list of SeqRecord objects (e.g. 192 iteration or slicing to create a sub-alignment). Instead use the 193 Python builtin function list, i.e. my_list = list(my_align) 194 """ 195 import warnings 196 import Bio 197 warnings.warn("This method is deprecated, since the alignment object" 198 "now acts more like a list. Instead of calling " 199 "align.get_all_seqs() you can use list(align)", 200 Bio.BiopythonDeprecationWarning) 201 return self._records
202
203 - def __iter__(self):
204 """Iterate over alignment rows as SeqRecord objects. 205 206 e.g. 207 208 >>> from Bio.Alphabet import IUPAC, Gapped 209 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 210 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 211 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 212 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 213 >>> for record in align: 214 ... print(record.id) 215 ... print(record.seq) 216 Alpha 217 ACTGCTAGCTAG 218 Beta 219 ACT-CTAGCTAG 220 Gamma 221 ACTGCTAGATAG 222 """ 223 return iter(self._records)
224
225 - def get_seq_by_num(self, number):
226 """Retrieve a sequence by row number (DEPRECATED). 227 228 Returns: 229 - A Seq object for the requested sequence. 230 231 Raises: 232 - IndexError - If the specified number is out of range. 233 234 NOTE: This is a legacy method. In new code where you need to access 235 the rows of the alignment (i.e. the sequences) consider iterating 236 over them or accessing them as SeqRecord objects. 237 """ 238 import warnings 239 import Bio 240 warnings.warn("This is a legacy method and is likely to be removed in a future release of Biopython. In new code where you need to access the rows of the alignment (i.e. the sequences) consider iterating over them or accessing them as SeqRecord objects.", Bio.BiopythonDeprecationWarning) 241 return self._records[number].seq
242
243 - def __len__(self):
244 """Returns the number of sequences in the alignment. 245 246 Use len(alignment) to get the number of sequences (i.e. the number of 247 rows), and alignment.get_alignment_length() to get the length of the 248 longest sequence (i.e. the number of columns). 249 250 This is easy to remember if you think of the alignment as being like a 251 list of SeqRecord objects. 252 """ 253 return len(self._records)
254
255 - def get_alignment_length(self):
256 """Return the maximum length of the alignment. 257 258 All objects in the alignment should (hopefully) have the same 259 length. This function will go through and find this length 260 by finding the maximum length of sequences in the alignment. 261 262 >>> from Bio.Alphabet import IUPAC, Gapped 263 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 264 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 265 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 266 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 267 >>> align.get_alignment_length() 268 12 269 270 If you want to know the number of sequences in the alignment, 271 use len(align) instead: 272 273 >>> len(align) 274 3 275 276 """ 277 max_length = 0 278 279 for record in self._records: 280 if len(record.seq) > max_length: 281 max_length = len(record.seq) 282 283 return max_length
284
285 - def add_sequence(self, descriptor, sequence, start = None, end = None, 286 weight = 1.0):
287 """Add a sequence to the alignment. 288 289 This doesn't do any kind of alignment, it just adds in the sequence 290 object, which is assumed to be prealigned with the existing 291 sequences. 292 293 Arguments: 294 - descriptor - The descriptive id of the sequence being added. 295 This will be used as the resulting SeqRecord's 296 .id property (and, for historical compatibility, 297 also the .description property) 298 - sequence - A string with sequence info. 299 - start - You can explicitly set the start point of the sequence. 300 This is useful (at least) for BLAST alignments, which can 301 just be partial alignments of sequences. 302 - end - Specify the end of the sequence, which is important 303 for the same reason as the start. 304 - weight - The weight to place on the sequence in the alignment. 305 By default, all sequences have the same weight. (0.0 => 306 no weight, 1.0 => highest weight) 307 """ 308 new_seq = Seq(sequence, self._alphabet) 309 310 #We are now effectively using the SeqRecord's .id as 311 #the primary identifier (e.g. in Bio.SeqIO) so we should 312 #populate it with the descriptor. 313 #For backwards compatibility, also store this in the 314 #SeqRecord's description property. 315 new_record = SeqRecord(new_seq, 316 id = descriptor, 317 description = descriptor) 318 319 # hack! We really need to work out how to deal with annotations 320 # and features in biopython. Right now, I'll just use the 321 # generic annotations dictionary we've got to store the start 322 # and end, but we should think up something better. I don't know 323 # if I'm really a big fan of the LocatableSeq thing they've got 324 # in BioPerl, but I'm not positive what the best thing to do on 325 # this is... 326 if start: 327 new_record.annotations['start'] = start 328 if end: 329 new_record.annotations['end'] = end 330 331 # another hack to add weight information to the sequence 332 new_record.annotations['weight'] = weight 333 334 self._records.append(new_record)
335
336 - def get_column(self, col):
337 """Returns a string containing a given column. 338 339 e.g. 340 341 >>> from Bio.Alphabet import IUPAC, Gapped 342 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 343 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 344 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 345 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 346 >>> align.get_column(0) 347 'AAA' 348 >>> align.get_column(3) 349 'G-G' 350 """ 351 #TODO - Support negative indices? 352 col_str = '' 353 assert col >= 0 and col <= self.get_alignment_length() 354 for rec in self._records: 355 col_str += rec.seq[col] 356 return col_str
357
358 - def __getitem__(self, index):
359 """Access part of the alignment. 360 361 We'll use the following example alignment here for illustration: 362 363 >>> from Bio.Alphabet import IUPAC, Gapped 364 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 365 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 366 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 367 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 368 >>> align.add_sequence("Delta", "ACTGCTTGCTAG") 369 >>> align.add_sequence("Epsilon", "ACTGCTTGATAG") 370 371 You can access a row of the alignment as a SeqRecord using an integer 372 index (think of the alignment as a list of SeqRecord objects here): 373 374 >>> first_record = align[0] 375 >>> print("%s %s" % (first_record.id, first_record.seq)) 376 Alpha ACTGCTAGCTAG 377 >>> last_record = align[-1] 378 >>> print("%s %s" % (last_record.id, last_record.seq)) 379 Epsilon ACTGCTTGATAG 380 381 You can also access use python's slice notation to create a sub-alignment 382 containing only some of the SeqRecord objects: 383 384 >>> sub_alignment = align[2:5] 385 >>> print(sub_alignment) 386 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 387 ACTGCTAGATAG Gamma 388 ACTGCTTGCTAG Delta 389 ACTGCTTGATAG Epsilon 390 391 This includes support for a step, i.e. align[start:end:step], which 392 can be used to select every second sequence: 393 394 >>> sub_alignment = align[::2] 395 >>> print(sub_alignment) 396 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 397 ACTGCTAGCTAG Alpha 398 ACTGCTAGATAG Gamma 399 ACTGCTTGATAG Epsilon 400 401 Or to get a copy of the alignment with the rows in reverse order: 402 403 >>> rev_alignment = align[::-1] 404 >>> print(rev_alignment) 405 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns 406 ACTGCTTGATAG Epsilon 407 ACTGCTTGCTAG Delta 408 ACTGCTAGATAG Gamma 409 ACT-CTAGCTAG Beta 410 ACTGCTAGCTAG Alpha 411 412 Right now, these are the ONLY indexing operations supported. The use of 413 a second column based index is under discussion for a future update. 414 """ 415 if isinstance(index, int): 416 #e.g. result = align[x] 417 #Return a SeqRecord 418 return self._records[index] 419 elif isinstance(index, slice): 420 #e.g. sub_aling = align[i:j:k] 421 #Return a new Alignment using only the specified records. 422 #TODO - See Bug 2554 for changing the __init__ method 423 #to allow us to do this more cleanly. 424 sub_align = Alignment(self._alphabet) 425 sub_align._records = self._records[index] 426 return sub_align 427 elif len(index)==2: 428 raise TypeError("Row and Column indexing is not currently supported," 429 +"but may be in future.") 430 else: 431 raise TypeError("Invalid index type.")
432 433
434 -def _test():
435 """Run the Bio.Align.Generic module's doctests.""" 436 print("Running doctests...") 437 import doctest 438 doctest.testmod() 439 print("Done")
440 441 if __name__ == "__main__": 442 _test() 443