Package Bio :: Package Restriction :: Module Restriction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Restriction.Restriction

   1  #!/usr/bin/env python 
   2  # 
   3  #      Restriction Analysis Libraries. 
   4  #      Copyright (C) 2004. Frederic Sohm. 
   5  # 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  # 
  10   
  11  """Restriction Enzyme classes. 
  12   
  13  Notes about the various classes of the restriction enzyme implementation::
  14   
  15              RestrictionType is the type of all restriction enzymes. 
  16          ---------------------------------------------------------------------------- 
  17              AbstractCut implements some methods that are common to all enzymes. 
  18          ---------------------------------------------------------------------------- 
  19              NoCut, OneCut,TwoCuts   represent the number of double strand cuts 
  20                                      produced by the enzyme. 
  21                                      they correspond to the 4th field of the rebase 
  22                                      record emboss_e.NNN. 
  23                      0->NoCut    : the enzyme is not characterised. 
  24                      2->OneCut   : the enzyme produces one double strand cut.
  25                      4->TwoCuts  : two double strand cuts. 
  26          ---------------------------------------------------------------------------- 
  27              Meth_Dep, Meth_Undep    represent the methylation susceptibility to 
  28                                      the enzyme. 
  29                                      Not implemented yet. 
  30          ---------------------------------------------------------------------------- 
  31              Palindromic,            if the site is palindromic or not. 
  32              NotPalindromic          allow some optimisations of the code. 
  33                                      No need to check the reverse strand 
  34                                      with palindromic sites. 
  35          ---------------------------------------------------------------------------- 
  36              Unknown, Blunt,         represent the overhang. 
  37              Ov5, Ov3                Unknown is here for symmetry reasons and
  38                                      corresponds to enzymes that are not characterised
  39                                      in rebase. 
  40          ---------------------------------------------------------------------------- 
  41              Defined, Ambiguous,     represent the sequence of the overhang. 
  42              NotDefined 
  43                                      NotDefined is for enzymes not characterised in 
  44                                      rebase. 
  45   
  46                                      Defined correspond to enzymes that display a 
  47                                      constant overhang whatever the sequence. 
  48                                      ex : EcoRI. G^AATTC -> overhang :AATT 
  49                                                  CTTAA^G 
  50   
  51                                      Ambiguous : the overhang varies with the 
  52                                      sequence restricted. 
  53                                      Typically enzymes which cut outside their 
  54                                      restriction site or (but not always) 
  55                                      inside an ambiguous site. 
  56                                      ex: 
  57                                      AcuI CTGAAG(22/20)  -> overhang : NN 
  58                                      AasI GACNNN^NNNGTC  -> overhang : NN 
  59                                           CTGN^NNNNNCAG 
  60   
  61                  note : these 3 classes refer to the overhang not the site.
  62                     So the enzyme ApoI (RAATTY) is defined even if its restriction 
  63                     site is ambiguous. 
  64   
  65                          ApoI R^AATTY -> overhang : AATT -> Defined 
  66                               YTTAA^R 
  67                     Accordingly, blunt enzymes are always Defined even 
  68                     when they cut outside their restriction site. 
  69          ---------------------------------------------------------------------------- 
  70              Not_available,          as found in rebase file emboss_r.NNN files. 
  71              Commercially_available 
  72                                      allow the selection of the enzymes according to 
  73                                      their suppliers to reduce the quantity 
  74                                      of results. 
  75                                      Also will allow the implementation of buffer 
  76                                      compatibility tables. Not implemented yet. 
  77   
  78                                      the list of suppliers is extracted from 
  79                                      emboss_s.NNN 
  80          ---------------------------------------------------------------------------- 
  81  """ 
  82   
  83  from __future__ import print_function 
  84  from Bio._py3k import zip 
  85  from Bio._py3k import filter 
  86  from Bio._py3k import range 
  87   
  88  import re 
  89  import itertools 
  90   
  91  from Bio.Seq import Seq, MutableSeq 
  92  from Bio.Alphabet import IUPAC 
  93   
  94  from Bio.Restriction.Restriction_Dictionary import rest_dict as enzymedict 
  95  from Bio.Restriction.Restriction_Dictionary import typedict 
  96  from Bio.Restriction.Restriction_Dictionary import suppliers as suppliers_dict 
  97  # TODO: Consider removing this wildcard import. 
  98  from Bio.Restriction.RanaConfig import * 
  99  from Bio.Restriction.PrintFormat import PrintFormat 
 100   
 101  __docformat__ = "restructuredtext en" 
102 103 # Used to use Bio.Restriction.DNAUtils.check_bases (and expose it under this 104 # namespace), but have deprecated that module. 105 -def _check_bases(seq_string):
106 """Check characters in a string (PRIVATE). 107 108 Remove digits and white space present in string. Allows any valid ambiguous 109 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, lower case are converted). 110 111 Other characters (e.g. symbols) trigger a TypeError. 112 113 Returns the string WITH A LEADING SPACE (!). This is for backwards 114 compatibility, and may in part be explained by the fact that 115 Bio.Restriction doesn't use zero based counting. 116 """ 117 # Remove white space and make upper case: 118 seq_string = "".join(seq_string.split()).upper() 119 # Remove digits 120 for c in "0123456789": 121 seq_string = seq_string.replace(c, "") 122 # Check only allowed IUPAC letters 123 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): 124 raise TypeError("Invalid character found in %s" % repr(seq_string)) 125 return " " + seq_string
# Lookup table keyed by IUPAC DNA single letter code.  Each value is a string
# of IUPAC letters; presumably the codes that share at least one base with the
# key (e.g. 'R' = A or G, so it lists A, G and every ambiguity code containing
# A or G) -- confirm against the code that consumes this table.
matching = {
    'A': 'ARWMHVDN',
    'C': 'CYSMHBVN',
    'G': 'GRSKBVDN',
    'T': 'TYWKHBDN',
    'R': 'ABDGHKMNSRWV',
    'Y': 'CBDHKMNSTWVY',
    'W': 'ABDHKMNRTWVY',
    'S': 'CBDGHKMNSRVY',
    'M': 'ACBDHMNSRWVY',
    'K': 'BDGHKNSRTWVY',
    'H': 'ACBDHKMNSRTWVY',
    'B': 'CBDGHKMNSRTWVY',
    'V': 'ACBDGHKMNSRWVY',
    'D': 'ABDGHKMNSRTWVY',
    'N': 'ACBDGHKMNSRTWVY',
}

# Alias: Bio.Seq.Seq is also exposed from this module under the name DNA.
DNA = Seq
class FormattedSeq(object):
    """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

    Translate a Bio.Seq into a formatted sequence to be used with Restriction.

    Roughly:
    the sequence is passed through _check_bases (white space and digits
    are dropped, letters are upper-cased, any non IUPAC character raises
    TypeError) and a space is added in front of the sequence to get a
    biological index instead of a python index (i.e. index of the first
    base is 1 not 0).

    Retains information about the shape of the molecule: linear (default)
    or circular. Restriction sites are searched over the edges of circular
    sequences.

    Attributes set by __init__:
     - data      the cleaned sequence string, with its leading space.
     - lower     True if the original sequence string was all lower case.
     - linear    True for a linear molecule, False for a circular one.
     - klass     class of the original sequence, used to rebuild slices.
     - alphabet  alphabet of the original sequence.
    """

    def __init__(self, seq, linear=True):
        """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

        seq is either a Bio.Seq, Bio.MutableSeq or a FormattedSeq.
        if seq is a FormattedSeq, linear will have no effect on the
        shape of the sequence (every attribute is copied from seq).

        Raises TypeError for any other type of seq.
        """
        if isinstance(seq, (Seq, MutableSeq)):
            stringy = str(seq)
            self.lower = stringy.islower()
            # Note this adds a leading space to the sequence (!)
            self.data = _check_bases(stringy)
            self.linear = linear
            self.klass = seq.__class__
            self.alphabet = seq.alphabet
        elif isinstance(seq, FormattedSeq):
            self.lower = seq.lower
            self.data = seq.data
            self.linear = seq.linear
            self.alphabet = seq.alphabet
            self.klass = seq.klass
        else:
            raise TypeError('expected Seq or MutableSeq, got %s' % type(seq))

    def __len__(self):
        """Return the number of bases (the leading space is not counted)."""
        return len(self.data) - 1

    def __repr__(self):
        """Return 'FormattedSeq(<repr of the sequence>, linear=<bool>)'."""
        return 'FormattedSeq(%s, linear=%s)' % (repr(self[1:]),
                                                repr(self.linear))

    def __eq__(self, other):
        """FS == other -> bool.

        True only if other is a FormattedSeq with the same repr, i.e. the
        same sequence representation and the same shape.
        """
        return isinstance(other, FormattedSeq) and repr(self) == repr(other)

    def __ne__(self, other):
        """FS != other -> bool. Always the inverse of FS == other.

        Python 2 does not derive != from ==, so without this method two
        equal FormattedSeq would also compare as different there.
        """
        return not self == other

    def circularise(self):
        """FS.circularise() -> circularise FS (in place)."""
        self.linear = False

    def linearise(self):
        """FS.linearise() -> linearise FS (in place)."""
        self.linear = True

    def to_linear(self):
        """FS.to_linear() -> new linear FS instance."""
        new = self.__class__(self)
        new.linear = True
        return new

    def to_circular(self):
        """FS.to_circular() -> new circular FS instance."""
        new = self.__class__(self)
        new.linear = False
        return new

    def is_linear(self):
        """FS.is_linear() -> bool.

        True if the sequence will be analysed as a linear sequence.
        """
        return self.linear

    def finditer(self, pattern, size):
        """FS.finditer(pattern, size) -> list.

        return a list of the matches of pattern in the sequence.
        the list is made of tuples (location, match.group) where location
        is the start of the match in the space-prefixed data and
        match.group is the bound group method of the match object.
        the latter is used with non palindromic sites.

        pattern is the regular expression pattern corresponding to the
        enzyme restriction site.
        size is the size of the restriction enzyme recognition-site.
        """
        if self.is_linear():
            data = self.data
        else:
            # Circular: append the first size-1 bases so that a site
            # spanning the end/start junction can still be matched.
            data = self.data + self.data[1:size]
        return [(hit.start(), hit.group) for hit in re.finditer(pattern, data)]

    def __getitem__(self, i):
        """Return data[i] wrapped in the original sequence class.

        i indexes the space-prefixed data, so FS[1:] is the whole sequence.
        The original lower case is restored if the input was lower case.
        """
        chunk = self.data[i]
        if self.lower:
            chunk = chunk.lower()
        return self.klass(chunk, self.alphabet)
class RestrictionType(type):
    """RestrictionType. Type from which derives all enzyme classes.

    Implement the operator methods.

    This is a metaclass: every method below receives the enzyme class
    itself as cls.  The methods read the following attributes of the
    enzyme class: compsite, size, site and charac, and call its search,
    catalyse and _mod1 methods, all of which are provided elsewhere.
    """

    def __init__(cls, name='', bases=(), dct=None):
        """RE(name, bases, dct) -> RestrictionType instance.

        Not intended to be used in normal operation. The enzymes are
        instantiated when importing the module.

        Replaces cls.compsite (a regular expression string) by its compiled
        form.  bases and dct are accepted for the metaclass protocol but
        are not used here.

        Raises ValueError if name contains a hyphen or if cls.compsite
        cannot be compiled.

        see below.
        """
        if "-" in name:
            raise ValueError("Problem with hyphen in %s as enzyme name"
                             % repr(name))
        # 2011/11/26 - Nobody knows what this call was supposed to accomplish,
        # but all unit tests seem to pass without it.
        # super(RestrictionType, cls).__init__(cls, name, bases, dct)
        try:
            cls.compsite = re.compile(cls.compsite)
        except Exception:
            # NOTE: if cls has no compsite attribute at all, the repr() below
            # raises AttributeError again, so that case surfaces as an
            # AttributeError rather than a ValueError.
            raise ValueError("Problem with regular expression, re.compiled(%s)"
                             % repr(cls.compsite))

    def __add__(cls, other):
        """RE.__add__(other) -> RestrictionBatch().

        if other is an enzyme returns a batch of the two enzymes.
        if other is already a RestrictionBatch add enzyme to it.

        Raises TypeError for any other type.
        """
        if isinstance(other, RestrictionType):
            return RestrictionBatch([cls, other])
        if isinstance(other, RestrictionBatch):
            return other.add_nocheck(cls)
        raise TypeError(
            'expected RestrictionType or RestrictionBatch, got %s instead'
            % type(other))

    def __div__(cls, other):
        """RE.__div__(other) -> list.

        RE/other
        returns RE.search(other).
        """
        return cls.search(other)

    def __rdiv__(cls, other):
        """RE.__rdiv__(other) -> list.

        other/RE
        returns RE.search(other).
        """
        return cls.search(other)

    def __truediv__(cls, other):
        """RE.__truediv__(other) -> list.

        RE/other
        returns RE.search(other).
        """
        return cls.search(other)

    def __rtruediv__(cls, other):
        """RE.__rtruediv__(other) -> list.

        other/RE
        returns RE.search(other).
        """
        return cls.search(other)

    def __floordiv__(cls, other):
        """RE.__floordiv__(other) -> list.

        RE//other
        returns RE.catalyse(other).
        """
        return cls.catalyse(other)

    def __rfloordiv__(cls, other):
        """RE.__rfloordiv__(other) -> list.

        other//RE
        returns RE.catalyse(other).
        """
        return cls.catalyse(other)

    def __str__(cls):
        """RE.__str__() -> str.

        return the name of the enzyme.
        """
        return cls.__name__

    def __repr__(cls):
        """RE.__repr__() -> str.

        used with eval or exec will instantiate the enzyme.
        """
        return "%s" % cls.__name__

    def __len__(cls):
        """RE.__len__() -> int.

        length of the recognition site.
        """
        return cls.size

    def __hash__(cls):
        """Return id(RE), consistent with the identity based __eq__."""
        # Python default is to use id(...)
        # This is consistent with the __eq__ implementation
        return id(cls)

    def __eq__(cls, other):
        """RE == other -> bool.

        True if RE and other are the same enzyme.

        Specifically this checks they are the same Python object.
        """
        return cls is other

    def __ne__(cls, other):
        """RE != other -> bool.

        isoschizomer strict, same recognition site, same restriction -> False
        all the other-> True

        WARNING - This is not the inverse of the __eq__ method.
        """
        if not isinstance(other, RestrictionType):
            return True
        # Compares the charac attributes, not object identity.
        return not cls.charac == other.charac

    def __rshift__(cls, other):
        """RE >> other -> bool.

        neoschizomer : same recognition site, different restriction. -> True
        all the others : -> False
        """
        if not isinstance(other, RestrictionType):
            return False
        return bool(cls.site == other.site and cls.charac != other.charac)

    def __mod__(cls, other):
        """a % b -> bool.

        Test compatibility of the overhang of a and b.
        True if a and b have compatible overhang.

        Raises TypeError if b is not an enzyme.
        """
        if not isinstance(other, RestrictionType):
            raise TypeError(
                'expected RestrictionType, got %s instead' % type(other))
        return cls._mod1(other)

    # The four ordering operators share one sort key: the size of the
    # recognition site first, then the enzyme name.  Comparing the
    # (size, name) tuples gives exactly that lexicographic order.  They raise
    # NotImplementedError (not return NotImplemented) for foreign types, which
    # is the historical behaviour of this class.

    def __ge__(cls, other):
        """a >= b -> bool.

        sorting order:
                    1. size of the recognition site.
                    2. if equal size, alphabetical order of the names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) >= (len(other), other.__name__)

    def __gt__(cls, other):
        """a > b -> bool.

        sorting order:
                    1. size of the recognition site.
                    2. if equal size, alphabetical order of the names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) > (len(other), other.__name__)

    def __le__(cls, other):
        """a <= b -> bool.

        sorting order:
                    1. size of the recognition site.
                    2. if equal size, alphabetical order of the names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) <= (len(other), other.__name__)

    def __lt__(cls, other):
        """a < b -> bool.

        sorting order:
                    1. size of the recognition site.
                    2. if equal size, alphabetical order of the names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) < (len(other), other.__name__)
445 446 -class AbstractCut(RestrictionType):
447 """Implement the methods that are common to all restriction enzymes. 448 449 All the methods are classmethod. 450 451 For internal use only. Not meant to be instantiate.""" 452 453 @classmethod
454 - def search(cls, dna, linear=True):
455 """RE.search(dna, linear=True) -> list. 456 457 return a list of all the site of RE in dna. Compensate for circular 458 sequences and so on. 459 460 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 461 462 if linear is False, the restriction sites than span over the boundaries 463 will be included. 464 465 The positions are the first base of the 3' fragment, 466 i.e. the first base after the position the enzyme will cut. """ 467 # 468 # Separating search from _search allow a (very limited) optimisation 469 # of the search when using a batch of restriction enzymes. 470 # in this case the DNA is tested once by the class which implements 471 # the batch instead of being tested by each enzyme single. 472 # see RestrictionBatch.search() for example. 473 # 474 if isinstance(dna, FormattedSeq): 475 cls.dna = dna 476 return cls._search() 477 else: 478 cls.