Package Bio :: Package Restriction :: Module Restriction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Restriction.Restriction

   1  #!/usr/bin/env python 
   2  # 
   3  #      Restriction Analysis Libraries. 
   4  #      Copyright (C) 2004. Frederic Sohm. 
   5  # 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  # 
  10   
  11  """Restriction Enzyme classes. 
  12   
  13  Notes about the various classes of the restriction enzyme implementation::
  14   
  15              RestrictionType is the type of all restriction enzymes. 
  16          ---------------------------------------------------------------------------- 
  17              AbstractCut implements some methods that are common to all enzymes. 
  18          ---------------------------------------------------------------------------- 
  19              NoCut, OneCut,TwoCuts   represent the number of double strand cuts 
  20                                      produced by the enzyme. 
  21                                      they correspond to the 4th field of the 
  22                                      rebase record emboss_e.NNN. 
  23                      0->NoCut    : the enzyme is not characterised. 
  24                      2->OneCut   : the enzyme produces one double strand cut.
  25                      4->TwoCuts  : two double strand cuts. 
  26          ---------------------------------------------------------------------------- 
  27              Meth_Dep, Meth_Undep    represent the methylation susceptibility to 
  28                                      the enzyme. 
  29                                      Not implemented yet. 
  30          ---------------------------------------------------------------------------- 
  31              Palindromic,            if the site is palindromic or not. 
  32              NotPalindromic          allow some optimisations of the code. 
  33                                      No need to check the reverse strand 
  34                                      with palindromic sites. 
  35          ---------------------------------------------------------------------------- 
  36              Unknown, Blunt,         represent the overhang. 
  37              Ov5, Ov3                Unknown is here for symmetry reasons and 
  38                                      correspond to enzymes that are not 
  39                                      characterised in rebase. 
  40          ---------------------------------------------------------------------------- 
  41              Defined, Ambiguous,     represent the sequence of the overhang. 
  42              NotDefined 
  43                                      NotDefined is for enzymes not characterised 
  44                                      in rebase. 
  45   
  46                                      Defined correspond to enzymes that display 
  47                                      a constant overhang whatever the sequence. 
  48                                      ex : EcoRI. G^AATTC -> overhang :AATT 
  49                                                  CTTAA^G 
  50   
  51                                      Ambiguous : the overhang varies with the 
  52                                      sequence restricted. 
  53                                      Typically enzymes which cut outside their 
  54                                      restriction site or (but not always) 
  55                                      inside an ambiguous site. 
  56                                      ex: 
  57                                      AcuI CTGAAG(22/20)  -> overhang : NN 
  58                                      AasI GACNNN^NNNGTC  -> overhang : NN 
  59                                           CTGN^NNNNNCAG 
  60   
  61                  note : these 3 classes refer to the overhang not the site.
  62                     So the enzyme ApoI (RAATTY) is defined even if its 
  63                     restriction site is ambiguous. 
  64   
  65                          ApoI R^AATTY -> overhang : AATT -> Defined 
  66                               YTTAA^R 
  67                     Accordingly, blunt enzymes are always Defined even 
  68                     when they cut outside their restriction site. 
  69          ---------------------------------------------------------------------------- 
  70              Not_available,          as found in rebase file emboss_r.NNN files. 
  71              Commercially_available 
  72                                      allow the selection of the enzymes 
  73                                      according to their suppliers to reduce the 
  74                                      quantity of results. 
  75                                      Also will allow the implementation of 
  76                                      buffer compatibility tables. Not 
  77                                      implemented yet. 
  78   
  79                                      the list of suppliers is extracted from 
  80                                      emboss_s.NNN 
  81          ---------------------------------------------------------------------------- 
  82  """ 
  83   
  84  from __future__ import print_function 
  85  from Bio._py3k import zip 
  86  from Bio._py3k import filter 
  87  from Bio._py3k import range 
  88   
  89  import re 
  90  import itertools 
  91   
  92  from Bio.Seq import Seq, MutableSeq 
  93  from Bio.Alphabet import IUPAC 
  94   
  95  from Bio.Restriction.Restriction_Dictionary import rest_dict as enzymedict 
  96  from Bio.Restriction.Restriction_Dictionary import typedict 
  97  from Bio.Restriction.Restriction_Dictionary import suppliers as suppliers_dict 
  98  # TODO: Consider removing this wildcard import. 
  99  from Bio.Restriction.RanaConfig import * 
 100  from Bio.Restriction.PrintFormat import PrintFormat 
101 102 103 # Used to use Bio.Restriction.DNAUtils.check_bases (and expose it under this 104 # namespace), but have deprecated that module. 105 106 107 -def _check_bases(seq_string):
108 """Check characters in a string (PRIVATE). 109 110 Remove digits and white space present in string. Allows any valid ambiguous 111 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, lower case are converted). 112 113 Other characters (e.g. symbols) trigger a TypeError. 114 115 Returns the string WITH A LEADING SPACE (!). This is for backwards 116 compatibility, and may in part be explained by the fact that 117 Bio.Restriction doesn't use zero based counting. 118 """ 119 # Remove white space and make upper case: 120 seq_string = "".join(seq_string.split()).upper() 121 # Remove digits 122 for c in "0123456789": 123 seq_string = seq_string.replace(c, "") 124 # Check only allowed IUPAC letters 125 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): 126 raise TypeError("Invalid character found in %s" % repr(seq_string)) 127 return " " + seq_string
# Lookup table keyed by IUPAC nucleotide code.  By inspection each value
# lists the codes that share at least one concrete base with the key
# (e.g. 'R' = A/G never lists 'C', 'T' or 'Y') -- presumably used to test
# whether two possibly ambiguous bases can match; confirm against callers.
matching = {
    'A': 'ARWMHVDN',
    'C': 'CYSMHBVN',
    'G': 'GRSKBVDN',
    'T': 'TYWKHBDN',
    'R': 'ABDGHKMNSRWV',
    'Y': 'CBDHKMNSTWVY',
    'W': 'ABDHKMNRTWVY',
    'S': 'CBDGHKMNSRVY',
    'M': 'ACBDHMNSRWVY',
    'K': 'BDGHKNSRTWVY',
    'H': 'ACBDHKMNSRTWVY',
    'B': 'CBDGHKMNSRTWVY',
    'V': 'ACBDGHKMNSRWVY',
    'D': 'ABDGHKMNSRTWVY',
    'N': 'ACBDGHKMNSRTWVY',
}

# Alias of Bio.Seq.Seq exposed under the name DNA.
DNA = Seq
class FormattedSeq(object):
    """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

    Translate a Bio.Seq into a formatted sequence to be used with Restriction.

    Roughly:
    the sequence is passed through _check_bases, which strips white space
    and digits, upper-cases the rest, raises TypeError on anything that is
    not an IUPAC DNA letter, and adds a space in front of the sequence to
    get a biological index instead of a python index (i.e. index of the
    first base is 1 not 0).

    Retains information about the shape of the molecule linear (default)
    or circular. Restriction sites are searched over the edges of circular
    sequences.

    Attributes set by __init__:
      - data     : the checked string, with its leading space.
      - linear   : True for a linear molecule, False for a circular one.
      - lower    : True if the original string was all lower case.
      - klass    : class of the wrapped sequence, used to build slices.
      - alphabet : alphabet of the wrapped sequence.
    """

    def __init__(self, seq, linear=True):
        """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

        seq is either a Bio.Seq, Bio.MutableSeq or a FormattedSeq.
        if seq is a FormattedSeq, linear will have no effect on the
        shape of the sequence (the shape of seq is copied instead).

        Raises TypeError for any other type of seq.
        """
        if isinstance(seq, (Seq, MutableSeq)):
            stringy = str(seq)
            self.lower = stringy.islower()
            # Note this adds a leading space to the sequence (!)
            self.data = _check_bases(stringy)
            self.linear = linear
            self.klass = seq.__class__
            self.alphabet = seq.alphabet
        elif isinstance(seq, FormattedSeq):
            # Copy constructor: every attribute, including the shape, is
            # taken from the source object and `linear` is ignored.
            self.lower = seq.lower
            self.data = seq.data
            self.linear = seq.linear
            self.alphabet = seq.alphabet
            self.klass = seq.klass
        else:
            raise TypeError('expected Seq or MutableSeq, got %s' % type(seq))

    def __len__(self):
        """Return the number of bases (the leading space is not counted)."""
        return len(self.data) - 1

    def __repr__(self):
        """Return a string built from the wrapped sequence and the shape."""
        return 'FormattedSeq(%s, linear=%s)' % (repr(self[1:]),
                                                repr(self.linear))

    def __eq__(self, other):
        """FS == other -> bool.

        True only if other is a FormattedSeq with the same repr().
        """
        # NOTE(review): equality is delegated to repr(); if the repr of the
        # wrapped sequence class abbreviates long sequences, two different
        # sequences could compare equal -- confirm against Bio.Seq.
        return isinstance(other, FormattedSeq) and repr(self) == repr(other)

    def __ne__(self, other):
        """FS != other -> bool.

        Explicit inverse of __eq__.  Python 3 derives this automatically,
        Python 2 does not and would fall back to an identity comparison.
        """
        return not self == other

    def circularise(self):
        """FS.circularise() -> circularise FS (in place)."""
        self.linear = False
        return

    def linearise(self):
        """FS.linearise() -> linearise FS (in place)."""
        self.linear = True
        return

    def to_linear(self):
        """FS.to_linear() -> new linear FS instance."""
        new = self.__class__(self)
        new.linear = True
        return new

    def to_circular(self):
        """FS.to_circular() -> new circular FS instance."""
        new = self.__class__(self)
        new.linear = False
        return new

    def is_linear(self):
        """FS.is_linear() -> bool.

        True if the sequence will be analysed as a linear sequence.
        """
        return self.linear

    def finditer(self, pattern, size):
        """FS.finditer(pattern, size) -> list.

        return a list of the matches of pattern in the sequence.
        the list is made of tuples (location, match.group), where
        match.group is the bound method of the match object (not its
        result); it is used with non palindromic sites.
        pattern is the regular expression pattern corresponding to the
        enzyme restriction site.
        size is the size of the restriction enzyme recognition site.
        """
        if self.is_linear():
            data = self.data
        else:
            # Circular molecule: append the first size-1 bases (index 0 is
            # the padding space) so sites spanning the origin are found.
            data = self.data + self.data[1:size]
        return [(i.start(), i.group) for i in re.finditer(pattern, data)]

    def __getitem__(self, i):
        """Return self.data[i] wrapped in the original sequence class.

        The result is lower-cased when the original sequence was all
        lower case.
        """
        if self.lower:
            return self.klass((self.data[i]).lower(), self.alphabet)
        return self.klass(self.data[i], self.alphabet)
class RestrictionType(type):
    """RestrictionType. Type from which derives all enzyme classes.

    Implement the operator methods.

    The methods below read the following class attributes of an enzyme:
    compsite (regular expression source, compiled by __init__), size,
    site, charac, and the callables search, catalyse and _mod1.
    """

    def __init__(cls, name='', bases=(), dct=None):
        """RE(name, bases, dct) -> RestrictionType instance.

        Not intended to be used in normal operation. The enzymes are
        instantiated when importing the module.

        Replaces cls.compsite by its compiled regular expression.

        Raises ValueError if name contains a hyphen or if cls.compsite
        cannot be compiled.

        see below.
        """
        if "-" in name:
            raise ValueError("Problem with hyphen in %s as enzyme name"
                             % repr(name))
        # type.__init__ is deliberately not called here: a 2011/11/26 note
        # records that nobody knew what the former call was supposed to
        # accomplish and that all unit tests passed without it.
        #
        # Read the pattern outside the try block so that a missing
        # attribute surfaces as the original AttributeError and the
        # error message below never has to touch cls.compsite again.
        pattern = cls.compsite
        try:
            cls.compsite = re.compile(pattern)
        except Exception:
            raise ValueError("Problem with regular expression, re.compiled(%s)"
                             % repr(pattern))

    def __add__(cls, other):
        """RE.__add__(other) -> RestrictionBatch().

        if other is an enzyme returns a batch of the two enzymes.
        if other is already a RestrictionBatch add enzyme to it.
        Raises TypeError for anything else.
        """
        if isinstance(other, RestrictionType):
            return RestrictionBatch([cls, other])
        elif isinstance(other, RestrictionBatch):
            return other.add_nocheck(cls)
        else:
            raise TypeError(
                'expected RestrictionType or RestrictionBatch, got %s instead'
                % type(other))

    def __div__(cls, other):
        """RE.__div__(other) -> list.

        RE/other
        returns RE.search(other).
        """
        return cls.search(other)

    def __rdiv__(cls, other):
        """RE.__rdiv__(other) -> list.

        other/RE
        returns RE.search(other).
        """
        return cls.search(other)

    def __truediv__(cls, other):
        """RE.__truediv__(other) -> list.

        RE/other
        returns RE.search(other).
        """
        return cls.search(other)

    def __rtruediv__(cls, other):
        """RE.__rtruediv__(other) -> list.

        other/RE
        returns RE.search(other).
        """
        return cls.search(other)

    def __floordiv__(cls, other):
        """RE.__floordiv__(other) -> list.

        RE//other
        returns RE.catalyse(other).
        """
        return cls.catalyse(other)

    def __rfloordiv__(cls, other):
        """RE.__rfloordiv__(other) -> list.

        other//RE
        returns RE.catalyse(other).
        """
        return cls.catalyse(other)

    def __str__(cls):
        """RE.__str__() -> str.

        return the name of the enzyme.
        """
        return cls.__name__

    def __repr__(cls):
        """RE.__repr__() -> str.

        used with eval or exec will instantiate the enzyme.
        """
        return "%s" % cls.__name__

    def __len__(cls):
        """RE.__len__() -> int.

        length of the recognition site.
        """
        return cls.size

    def __hash__(cls):
        """Return id(cls), consistent with the identity based __eq__."""
        # Python default is to use id(...)
        return id(cls)

    def __eq__(cls, other):
        """RE == other -> bool

        True if RE and other are the same enzyme.

        Specifically this checks they are the same Python object.
        """
        return cls is other

    def __ne__(cls, other):
        """RE != other -> bool.

        isoschizomer strict, same recognition site, same restriction -> False
        all the other-> True

        WARNING - This is not the inverse of the __eq__ method.
        """
        if not isinstance(other, RestrictionType):
            return True
        # Two distinct enzymes with equal charac are reported as "not
        # different", which is why this is not simply `not (cls == other)`.
        return not (cls.charac == other.charac)

    def __rshift__(cls, other):
        """RE >> other -> bool.

        neoschizomer : same recognition site, different restriction. -> True
        all the others : -> False
        """
        if not isinstance(other, RestrictionType):
            return False
        return bool(cls.site == other.site and cls.charac != other.charac)

    def __mod__(cls, other):
        """a % b -> bool.

        Test compatibility of the overhang of a and b.
        True if a and b have compatible overhang.
        Raises TypeError if b is not an enzyme.
        """
        if not isinstance(other, RestrictionType):
            raise TypeError(
                'expected RestrictionType, got %s instead' % type(other))
        return cls._mod1(other)

    # Ordering: enzymes sort first by the length of their recognition site
    # and then, for equal lengths, alphabetically by name.  Comparing the
    # (length, name) tuples expresses exactly that.  Comparing with a
    # non-enzyme raises NotImplementedError (kept for compatibility).

    def __ge__(cls, other):
        """a >= b -> bool.

        a is greater or equal than b if the a site is longer than b site.
        if their site have the same length sort by alphabetical order of their
        names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) >= (len(other), other.__name__)

    def __gt__(cls, other):
        """a > b -> bool.

        sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) > (len(other), other.__name__)

    def __le__(cls, other):
        """a <= b -> bool.

        sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) <= (len(other), other.__name__)

    def __lt__(cls, other):
        """a < b -> bool.

        sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) < (len(other), other.__name__)
457 458 -class AbstractCut(RestrictionType):
459 """Implement the methods that are common to all restriction enzymes. 460 461 All the methods are classmethod. 462 463 For internal use only. Not meant to be instantiate. 464 """ 465 466 @classmethod
467 - def search(cls, dna, linear=True):
468 """RE.search(dna, linear=True) -> list. 469 470 return a list of all the site of RE in dna. Compensate for circular 471 sequences and so on. 472 473 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 474 475 if linear is False, the restriction sites than span over the boundaries 476 will be included. 477 478 The positions are the first base of the 3' fragment, 479 i.e. the first base after the position the enzyme will cut. 480 """ 481 # 482 # Separating search from _search allow a (very limited) optimisation 483 # of the search when using a batch of restriction enzymes. 484 # in this case the DNA is tested once by the class which implements 485 # the batch instead of being tested by each enzyme single. 486 # see RestrictionBatch.search() for example. 487 # 488 if isinstance(