Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  # Copyright 2000-2004 Brad Chapman. 
  2  # Copyright 2001 Iddo Friedberg. 
  3  # Copyright 2007-2010 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """Classes for generic sequence alignment. 
  9   
 10  Contains classes to deal with generic sequence alignment stuff not 
 11  specific to a particular program or format. 
 12  """ 
 13  from __future__ import print_function 
 14   
 15  __docformat__ = "restructuredtext en"  # Don't just use plain text in epydoc API pages! 
 16   
 17  # biopython 
 18  from Bio.Seq import Seq 
 19  from Bio.SeqRecord import SeqRecord 
 20  from Bio import Alphabet 
 21   
 22   
23 -class Alignment(object):
24 """Represent a set of alignments (DEPRECATED). 25 26 This is a base class to represent alignments, which can be subclassed 27 to deal with an alignment in a specific format. 28 29 With the introduction of the MultipleSeqAlignment class in Bio.Align, 30 this base class is deprecated and is likely to be removed in future 31 releases of Biopython. 32 """
33 - def __init__(self, alphabet):
34 """Initialize a new Alignment object. 35 36 Arguments: 37 - alphabet - The alphabet to use for the sequence objects that are 38 created. This alphabet must be a gapped type. 39 40 e.g. 41 42 >>> from Bio.Alphabet import IUPAC, Gapped 43 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 44 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 45 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 46 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 47 >>> print(align) 48 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 49 ACTGCTAGCTAG Alpha 50 ACT-CTAGCTAG Beta 51 ACTGCTAGATAG Gamma 52 """ 53 import warnings 54 import Bio 55 warnings.warn("With the introduction of the MultipleSeqAlignment class in Bio.Align, this base class is deprecated and is likely to be removed in a future release of Biopython.", Bio.BiopythonDeprecationWarning) 56 if not (isinstance(alphabet, Alphabet.Alphabet) 57 or isinstance(alphabet, Alphabet.AlphabetEncoder)): 58 raise ValueError("Invalid alphabet argument") 59 self._alphabet = alphabet 60 # hold everything at a list of SeqRecord objects 61 self._records = []
62
63 - def _str_line(self, record, length=50):
64 """Returns a truncated string representation of a SeqRecord (PRIVATE). 65 66 This is a PRIVATE function used by the __str__ method. 67 """ 68 if record.seq.__class__.__name__ == "CodonSeq": 69 if len(record.seq) <= length: 70 return "%s %s" % (record.seq, record.id) 71 else: 72 return "%s...%s %s" \ 73 % (record.seq[:length-3], record.seq[-3:], record.id) 74 else: 75 if len(record.seq) <= length: 76 return "%s %s" % (record.seq, record.id) 77 else: 78 return "%s...%s %s" \ 79 % (record.seq[:length-6], record.seq[-3:], record.id)
80
81 - def __str__(self):
82 """Returns a multi-line string summary of the alignment. 83 84 This output is intended to be readable, but large alignments are 85 shown truncated. A maximum of 20 rows (sequences) and 50 columns 86 are shown, with the record identifiers. This should fit nicely on a 87 single screen. e.g. 88 89 >>> from Bio.Alphabet import IUPAC, Gapped 90 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 91 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 92 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 93 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 94 >>> print(align) 95 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 96 ACTGCTAGCTAG Alpha 97 ACT-CTAGCTAG Beta 98 ACTGCTAGATAG Gamma 99 100 See also the alignment's format method. 101 """ 102 rows = len(self._records) 103 lines = ["%s alignment with %i rows and %i columns" 104 % (str(self._alphabet), rows, self.get_alignment_length())] 105 if rows <= 20: 106 lines.extend(self._str_line(rec) for rec in self._records) 107 else: 108 lines.extend(self._str_line(rec) for rec in self._records[:18]) 109 lines.append("...") 110 lines.append(self._str_line(self._records[-1])) 111 return "\n".join(lines)
112
113 - def __repr__(self):
114 """Returns a representation of the object for debugging. 115 116 The representation cannot be used with eval() to recreate the object, 117 which is usually possible with simple python ojects. For example: 118 119 <Bio.Align.Generic.Alignment instance (2 records of length 14, 120 SingleLetterAlphabet()) at a3c184c> 121 122 The hex string is the memory address of the object, see help(id). 123 This provides a simple way to visually distinguish alignments of 124 the same size. 125 """ 126 # A doctest for __repr__ would be nice, but __class__ comes out differently 127 # if run via the __main__ trick. 128 return "<%s instance (%i records of length %i, %s) at %x>" % \ 129 (self.__class__, len(self._records), 130 self.get_alignment_length(), repr(self._alphabet), id(self))
131 # This version is useful for doing eval(repr(alignment)), 132 # but it can be VERY long: 133 # return "%s(%s, %s)" \ 134 # % (self.__class__, repr(self._records), repr(self._alphabet)) 135
136 - def format(self, format):
137 """Returns the alignment as a string in the specified file format. 138 139 The format should be a lower case string supported as an output 140 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 141 "stockholm", etc), which is used to turn the alignment into a 142 string. 143 144 e.g. 145 146 >>> from Bio.Alphabet import IUPAC, Gapped 147 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 148 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 149 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 150 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 151 >>> print(align.format("fasta")) 152 >Alpha 153 ACTGCTAGCTAG 154 >Beta 155 ACT-CTAGCTAG 156 >Gamma 157 ACTGCTAGATAG 158 <BLANKLINE> 159 >>> print(align.format("phylip")) 160 3 12 161 Alpha ACTGCTAGCT AG 162 Beta ACT-CTAGCT AG 163 Gamma ACTGCTAGAT AG 164 <BLANKLINE> 165 166 For Python 2.6, 3.0 or later see also the built in format() function. 167 """ 168 # See also the __format__ added for Python 2.6 / 3.0, PEP 3101 169 # See also the SeqRecord class and its format() method using Bio.SeqIO 170 return self.__format__(format)
171
172 - def __format__(self, format_spec):
173 """Returns the alignment as a string in the specified file format. 174 175 This method supports the python format() function added in 176 Python 2.6/3.0. The format_spec should be a lower case 177 string supported by Bio.AlignIO as an output file format. 178 See also the alignment's format() method.""" 179 if format_spec: 180 from Bio._py3k import StringIO 181 from Bio import AlignIO 182 handle = StringIO() 183 AlignIO.write([self], handle, format_spec) 184 return handle.getvalue() 185 else: 186 # Follow python convention and default to using __str__ 187 return str(self)
188
189 - def get_all_seqs(self):
190 """Return all of the sequences involved in the alignment (DEPRECATED). 191 192 The return value is a list of SeqRecord objects. 193 194 This method is deprecated, as the Alignment object itself now offers 195 much of the functionality of a list of SeqRecord objects (e.g. 196 iteration or slicing to create a sub-alignment). Instead use the 197 Python builtin function list, i.e. my_list = list(my_align) 198 """ 199 import warnings 200 import Bio 201 warnings.warn("This method is deprecated, since the alignment object" 202 "now acts more like a list. Instead of calling " 203 "align.get_all_seqs() you can use list(align)", 204 Bio.BiopythonDeprecationWarning) 205 return self._records
206
207 - def __iter__(self):
208 """Iterate over alignment rows as SeqRecord objects. 209 210 e.g. 211 212 >>> from Bio.Alphabet import IUPAC, Gapped 213 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 214 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 215 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 216 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 217 >>> for record in align: 218 ... print(record.id) 219 ... print(record.seq) 220 Alpha 221 ACTGCTAGCTAG 222 Beta 223 ACT-CTAGCTAG 224 Gamma 225 ACTGCTAGATAG 226 """ 227 return iter(self._records)
228
229 - def get_seq_by_num(self, number):
230 """Retrieve a sequence by row number (DEPRECATED). 231 232 Returns: 233 - A Seq object for the requested sequence. 234 235 Raises: 236 - IndexError - If the specified number is out of range. 237 238 NOTE: This is a legacy method. In new code where you need to access 239 the rows of the alignment (i.e. the sequences) consider iterating 240 over them or accessing them as SeqRecord objects. 241 """ 242 import warnings 243 import Bio 244 warnings.warn("This is a legacy method and is likely to be removed in a future release of Biopython. In new code where you need to access the rows of the alignment (i.e. the sequences) consider iterating over them or accessing them as SeqRecord objects.", Bio.BiopythonDeprecationWarning) 245 return self._records[number].seq
246
247 - def __len__(self):
248 """Returns the number of sequences in the alignment. 249 250 Use len(alignment) to get the number of sequences (i.e. the number of 251 rows), and alignment.get_alignment_length() to get the length of the 252 longest sequence (i.e. the number of columns). 253 254 This is easy to remember if you think of the alignment as being like a 255 list of SeqRecord objects. 256 """ 257 return len(self._records)
258
259 - def get_alignment_length(self):
260 """Return the maximum length of the alignment. 261 262 All objects in the alignment should (hopefully) have the same 263 length. This function will go through and find this length 264 by finding the maximum length of sequences in the alignment. 265 266 >>> from Bio.Alphabet import IUPAC, Gapped 267 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 268 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 269 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 270 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 271 >>> align.get_alignment_length() 272 12 273 274 If you want to know the number of sequences in the alignment, 275 use len(align) instead: 276 277 >>> len(align) 278 3 279 280 """ 281 max_length = 0 282 283 for record in self._records: 284 if len(record.seq) > max_length: 285 max_length = len(record.seq) 286 287 return max_length
288
289 - def add_sequence(self, descriptor, sequence, start=None, end=None, 290 weight=1.0):
291 """Add a sequence to the alignment. 292 293 This doesn't do any kind of alignment, it just adds in the sequence 294 object, which is assumed to be prealigned with the existing 295 sequences. 296 297 Arguments: 298 - descriptor - The descriptive id of the sequence being added. 299 This will be used as the resulting SeqRecord's 300 .id property (and, for historical compatibility, 301 also the .description property) 302 - sequence - A string with sequence info. 303 - start - You can explicitly set the start point of the sequence. 304 This is useful (at least) for BLAST alignments, which can 305 just be partial alignments of sequences. 306 - end - Specify the end of the sequence, which is important 307 for the same reason as the start. 308 - weight - The weight to place on the sequence in the alignment. 309 By default, all sequences have the same weight. (0.0 => 310 no weight, 1.0 => highest weight) 311 """ 312 new_seq = Seq(sequence, self._alphabet) 313 314 # We are now effectively using the SeqRecord's .id as 315 # the primary identifier (e.g. in Bio.SeqIO) so we should 316 # populate it with the descriptor. 317 # For backwards compatibility, also store this in the 318 # SeqRecord's description property. 319 new_record = SeqRecord(new_seq, 320 id=descriptor, 321 description=descriptor) 322 323 # hack! We really need to work out how to deal with annotations 324 # and features in biopython. Right now, I'll just use the 325 # generic annotations dictionary we've got to store the start 326 # and end, but we should think up something better. I don't know 327 # if I'm really a big fan of the LocatableSeq thing they've got 328 # in BioPerl, but I'm not positive what the best thing to do on 329 # this is... 330 if start: 331 new_record.annotations['start'] = start 332 if end: 333 new_record.annotations['end'] = end 334 335 # another hack to add weight information to the sequence 336 new_record.annotations['weight'] = weight 337 338 self._records.append(new_record)
339
340 - def get_column(self, col):
341 """Returns a string containing a given column. 342 343 e.g. 344 345 >>> from Bio.Alphabet import IUPAC, Gapped 346 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 347 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 348 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 349 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 350 >>> align.get_column(0) 351 'AAA' 352 >>> align.get_column(3) 353 'G-G' 354 """ 355 # TODO - Support negative indices? 356 col_str = '' 357 assert col >= 0 and col <= self.get_alignment_length() 358 for rec in self._records: 359 col_str += rec.seq[col] 360 return col_str
361
362 - def __getitem__(self, index):
363 """Access part of the alignment. 364 365 We'll use the following example alignment here for illustration: 366 367 >>> from Bio.Alphabet import IUPAC, Gapped 368 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 369 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 370 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 371 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 372 >>> align.add_sequence("Delta", "ACTGCTTGCTAG") 373 >>> align.add_sequence("Epsilon", "ACTGCTTGATAG") 374 375 You can access a row of the alignment as a SeqRecord using an integer 376 index (think of the alignment as a list of SeqRecord objects here): 377 378 >>> first_record = align[0] 379 >>> print("%s %s" % (first_record.id, first_record.seq)) 380 Alpha ACTGCTAGCTAG 381 >>> last_record = align[-1] 382 >>> print("%s %s" % (last_record.id, last_record.seq)) 383 Epsilon ACTGCTTGATAG 384 385 You can also access use python's slice notation to create a sub-alignment 386 containing only some of the SeqRecord objects: 387 388 >>> sub_alignment = align[2:5] 389 >>> print(sub_alignment) 390 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 391 ACTGCTAGATAG Gamma 392 ACTGCTTGCTAG Delta 393 ACTGCTTGATAG Epsilon 394 395 This includes support for a step, i.e. align[start:end:step], which 396 can be used to select every second sequence: 397 398 >>> sub_alignment = align[::2] 399 >>> print(sub_alignment) 400 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 401 ACTGCTAGCTAG Alpha 402 ACTGCTAGATAG Gamma 403 ACTGCTTGATAG Epsilon 404 405 Or to get a copy of the alignment with the rows in reverse order: 406 407 >>> rev_alignment = align[::-1] 408 >>> print(rev_alignment) 409 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns 410 ACTGCTTGATAG Epsilon 411 ACTGCTTGCTAG Delta 412 ACTGCTAGATAG Gamma 413 ACT-CTAGCTAG Beta 414 ACTGCTAGCTAG Alpha 415 416 Right now, these are the ONLY indexing operations supported. The use of 417 a second column based index is under discussion for a future update. 418 """ 419 if isinstance(index, int): 420 # e.g. result = align[x] 421 # Return a SeqRecord 422 return self._records[index] 423 elif isinstance(index, slice): 424 # e.g. sub_aling = align[i:j:k] 425 # Return a new Alignment using only the specified records. 426 # TODO - See Bug 2554 for changing the __init__ method 427 # to allow us to do this more cleanly. 428 sub_align = Alignment(self._alphabet) 429 sub_align._records = self._records[index] 430 return sub_align 431 elif len(index)==2: 432 raise TypeError("Row and Column indexing is not currently supported," 433 +"but may be in future.") 434 else: 435 raise TypeError("Invalid index type.")
436 437
438 -def _test():
439 """Run the Bio.Align.Generic module's doctests.""" 440 print("Running doctests...") 441 import doctest 442 doctest.testmod() 443 print("Done")
444 445 if __name__ == "__main__": 446 _test() 447