Package Bio :: Package Restriction :: Module Restriction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Restriction.Restriction

   1  #!/usr/bin/env python 
   2  # 
   3  #      Restriction Analysis Libraries. 
   4  #      Copyright (C) 2004. Frederic Sohm. 
   5  # 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  # 
  10   
  11  """Restriction Enzyme classes. 
  12   
  13  Notes about the various classes of the restriction enzyme implementation:: 
  14   
  15              RestrictionType is the type of all restriction enzymes. 
  16          ---------------------------------------------------------------------------- 
  17              AbstractCut implements some methods that are common to all enzymes. 
  18          ---------------------------------------------------------------------------- 
  19              NoCut, OneCut,TwoCuts   represent the number of double strand cuts 
  20                                      produced by the enzyme. 
  21                                      they correspond to the 4th field of the rebase 
  22                                      record emboss_e.NNN. 
  23                      0->NoCut    : the enzyme is not characterised. 
  24                      2->OneCut   : the enzyme produces one double strand cut. 
  25                      4->TwoCuts  : two double strand cuts. 
  26          ---------------------------------------------------------------------------- 
  27              Meth_Dep, Meth_Undep    represent the methylation susceptibility to 
  28                                      the enzyme. 
  29                                      Not implemented yet. 
  30          ---------------------------------------------------------------------------- 
  31              Palindromic,            if the site is palindromic or not. 
  32              NotPalindromic          allow some optimisations of the code. 
  33                                      No need to check the reverse strand 
  34                                      with palindromic sites. 
  35          ---------------------------------------------------------------------------- 
  36              Unknown, Blunt,         represent the overhang. 
  37              Ov5, Ov3                Unknown is here for symmetry reasons and 
  38                                      corresponds to enzymes that are not characterised 
  39                                      in rebase. 
  40          ---------------------------------------------------------------------------- 
  41              Defined, Ambiguous,     represent the sequence of the overhang. 
  42              NotDefined 
  43                                      NotDefined is for enzymes not characterised in 
  44                                      rebase. 
  45   
  46                                      Defined correspond to enzymes that display a 
  47                                      constant overhang whatever the sequence. 
  48                                      ex : EcoRI. G^AATTC -> overhang :AATT 
  49                                                  CTTAA^G 
  50   
  51                                      Ambiguous : the overhang varies with the 
  52                                      sequence restricted. 
  53                                      Typically enzymes which cut outside their 
  54                                      restriction site or (but not always) 
  55                                      inside an ambiguous site. 
  56                                      ex: 
  57                                      AcuI CTGAAG(22/20)  -> overhang : NN 
  58                                      AasI GACNNN^NNNGTC  -> overhang : NN 
  59                                           CTGN^NNNNNCAG 
  60   
  61                  note : these 3 classes refer to the overhang, not the site. 
  62                     So the enzyme ApoI (RAATTY) is defined even if its restriction 
  63                     site is ambiguous. 
  64   
  65                          ApoI R^AATTY -> overhang : AATT -> Defined 
  66                               YTTAA^R 
  67                     Accordingly, blunt enzymes are always Defined even 
  68                     when they cut outside their restriction site. 
  69          ---------------------------------------------------------------------------- 
  70              Not_available,          as found in rebase file emboss_r.NNN files. 
  71              Commercially_available 
  72                                      allow the selection of the enzymes according to 
  73                                      their suppliers to reduce the quantity 
  74                                      of results. 
  75                                      Also will allow the implementation of buffer 
  76                                      compatibility tables. Not implemented yet. 
  77   
  78                                      the list of suppliers is extracted from 
  79                                      emboss_s.NNN 
  80          ---------------------------------------------------------------------------- 
  81  """ 
  82   
  83  from __future__ import print_function 
  84  from Bio._py3k import zip 
  85  from Bio._py3k import filter 
  86  from Bio._py3k import range 
  87   
  88  import re 
  89  import itertools 
  90   
  91  from Bio.Seq import Seq, MutableSeq 
  92  from Bio.Alphabet import IUPAC 
  93   
  94  from Bio.Restriction.Restriction_Dictionary import rest_dict as enzymedict 
  95  from Bio.Restriction.Restriction_Dictionary import typedict 
  96  from Bio.Restriction.Restriction_Dictionary import suppliers as suppliers_dict 
  97  # TODO: Consider removing this wildcard import. 
  98  from Bio.Restriction.RanaConfig import * 
  99  from Bio.Restriction.PrintFormat import PrintFormat 
 100   
 101  __docformat__ = "restructuredtext en" 
102 103 # Used to use Bio.Restriction.DNAUtils.check_bases (and expose it under this 104 # namespace), but have deprecated that module. 105 106 107 -def _check_bases(seq_string):
108 """Check characters in a string (PRIVATE). 109 110 Remove digits and white space present in string. Allows any valid ambiguous 111 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, lower case are converted). 112 113 Other characters (e.g. symbols) trigger a TypeError. 114 115 Returns the string WITH A LEADING SPACE (!). This is for backwards 116 compatibility, and may in part be explained by the fact that 117 Bio.Restriction doesn't use zero based counting. 118 """ 119 # Remove white space and make upper case: 120 seq_string = "".join(seq_string.split()).upper() 121 # Remove digits 122 for c in "0123456789": 123 seq_string = seq_string.replace(c, "") 124 # Check only allowed IUPAC letters 125 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): 126 raise TypeError("Invalid character found in %s" % repr(seq_string)) 127 return " " + seq_string
# Lookup table keyed by IUPAC nucleotide code. Each value is a string of
# IUPAC codes associated with the key (presumably the codes regarded as
# compatible with it -- TODO confirm against the code that consumes it).
matching = {
    'A': 'ARWMHVDN',
    'C': 'CYSMHBVN',
    'G': 'GRSKBVDN',
    'T': 'TYWKHBDN',
    'R': 'ABDGHKMNSRWV',
    'Y': 'CBDHKMNSTWVY',
    'W': 'ABDHKMNRTWVY',
    'S': 'CBDGHKMNSRVY',
    'M': 'ACBDHMNSRWVY',
    'K': 'BDGHKNSRTWVY',
    'H': 'ACBDHKMNSRTWVY',
    'B': 'CBDGHKMNSRTWVY',
    'V': 'ACBDGHKMNSRWVY',
    'D': 'ABDGHKMNSRTWVY',
    'N': 'ACBDGHKMNSRTWVY',
}

# Module-level alias for the Seq class.
DNA = Seq
class FormattedSeq(object):
    """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

    Translate a Bio.Seq into a formatted sequence to be used with Restriction.

    Roughly:
    remove anything which is not IUPAC alphabet and then add a space
    in front of the sequence to get a biological index instead of a
    python index (i.e. index of the first base is 1 not 0).

    Retains information about the shape of the molecule: linear (default)
    or circular. Restriction sites are searched over the edges of circular
    sequences.
    """

    def __init__(self, seq, linear=True):
        """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

        seq is either a Bio.Seq, Bio.MutableSeq or a FormattedSeq.
        If seq is a FormattedSeq, its attributes (including its shape) are
        copied and the linear argument has no effect.

        Raises TypeError for any other kind of input.
        """
        if isinstance(seq, FormattedSeq):
            # Copy constructor: share the already formatted data.
            self.lower = seq.lower
            self.data = seq.data
            self.linear = seq.linear
            self.alphabet = seq.alphabet
            self.klass = seq.klass
        elif isinstance(seq, (Seq, MutableSeq)):
            stringy = str(seq)
            # Remember the case so slices can be handed back in lower case.
            self.lower = stringy.islower()
            # Note this adds a leading space to the sequence (!)
            self.data = _check_bases(stringy)
            self.linear = linear
            self.klass = seq.__class__
            self.alphabet = seq.alphabet
        else:
            raise TypeError('expected Seq or MutableSeq, got %s' % type(seq))

    def __len__(self):
        """Return the number of bases (the leading space is not counted)."""
        return len(self.data) - 1

    def __repr__(self):
        """Return a string showing the sequence and the linear flag."""
        return 'FormattedSeq(%s, linear=%s)' % (repr(self[1:]),
                                                repr(self.linear))

    def __eq__(self, other):
        """Return True if other is a FormattedSeq with the same repr."""
        return isinstance(other, FormattedSeq) and repr(self) == repr(other)

    def __ne__(self, other):
        """Return the negation of ``self == other``.

        Python 2 does not derive ``!=`` from ``__eq__``, so it is spelled
        out here to keep the two operators consistent on every supported
        interpreter.
        """
        return not self.__eq__(other)

    def circularise(self):
        """FS.circularise() -> circularise FS (in place)."""
        self.linear = False

    def linearise(self):
        """FS.linearise() -> linearise FS (in place)."""
        self.linear = True

    def to_linear(self):
        """FS.to_linear() -> new linear FS instance."""
        new = self.__class__(self)
        new.linear = True
        return new

    def to_circular(self):
        """FS.to_circular() -> new circular FS instance."""
        new = self.__class__(self)
        new.linear = False
        return new

    def is_linear(self):
        """FS.is_linear() -> bool.

        True if the sequence will be analysed as a linear sequence.
        """
        return self.linear

    def finditer(self, pattern, size):
        """FS.finditer(pattern, size) -> list.

        Return a list of the matches of pattern in the sequence.
        The list is made of tuples (location, pattern.group); the latter
        is used with non palindromic sites.

        pattern is the regular expression pattern corresponding to the
        enzyme restriction site.
        size is the size of the restriction enzyme recognition site.
        """
        data = self.data
        if not self.is_linear():
            # Append the first size - 1 bases so that sites spanning the
            # junction of a circular sequence are found as well.
            data = data + self.data[1:size]
        return [(hit.start(), hit.group) for hit in re.finditer(pattern, data)]

    def __getitem__(self, i):
        """Return self.data[i] wrapped in the original sequence class.

        The result is lower-cased when the source sequence was lower case.
        """
        chunk = self.data[i]
        if self.lower:
            chunk = chunk.lower()
        return self.klass(chunk, self.alphabet)
class RestrictionType(type):
    """RestrictionType. Type from which all enzyme classes derive.

    Implements the operator methods used on the enzyme classes themselves.
    """

    def __init__(cls, name='', bases=(), dct={}):
        """RE(name, bases, dct) -> RestrictionType instance.

        Not intended to be used in normal operation. The enzymes are
        instantiated when importing the module.

        Rejects names containing a hyphen and replaces cls.compsite by its
        compiled regular expression. Raises ValueError in both failure cases.
        """
        if "-" in name:
            raise ValueError("Problem with hyphen in %s as enzyme name"
                             % repr(name))
        # Historical note (2011/11/26): an explicit call to the parent
        # __init__ used to live here; nobody knew what it was meant to
        # achieve and the unit tests pass without it.
        try:
            compiled = re.compile(cls.compsite)
        except Exception:
            raise ValueError("Problem with regular expression, re.compiled(%s)"
                             % repr(cls.compsite))
        else:
            cls.compsite = compiled

    def __add__(cls, other):
        """RE.__add__(other) -> RestrictionBatch().

        With another enzyme, return a batch holding both enzymes.
        With a RestrictionBatch, add this enzyme to it.
        Anything else raises TypeError.
        """
        if isinstance(other, RestrictionType):
            return RestrictionBatch([cls, other])
        if isinstance(other, RestrictionBatch):
            return other.add_nocheck(cls)
        raise TypeError

    def __div__(cls, other):
        """RE.__div__(other) -> list.

        RE/other
        Delegates to RE.search(other).
        """
        return cls.search(other)

    def __rdiv__(cls, other):
        """RE.__rdiv__(other) -> list.

        other/RE
        Delegates to RE.search(other).
        """
        return cls.search(other)

    def __truediv__(cls, other):
        """RE.__truediv__(other) -> list.

        RE/other
        Delegates to RE.search(other).
        """
        return cls.search(other)

    def __rtruediv__(cls, other):
        """RE.__rtruediv__(other) -> list.

        other/RE
        Delegates to RE.search(other).
        """
        return cls.search(other)

    def __floordiv__(cls, other):
        """RE.__floordiv__(other) -> list.

        RE//other
        Delegates to RE.catalyse(other).
        """
        return cls.catalyse(other)

    def __rfloordiv__(cls, other):
        """RE.__rfloordiv__(other) -> list.

        other//RE
        Delegates to RE.catalyse(other).
        """
        return cls.catalyse(other)

    def __str__(cls):
        """RE.__str__() -> str.

        Return the name of the enzyme.
        """
        return cls.__name__

    def __repr__(cls):
        """RE.__repr__() -> str.

        The name of the enzyme; used with eval or exec it will give back
        the enzyme.
        """
        return cls.__name__

    def __len__(cls):
        """RE.__len__() -> int.

        Length of the recognition site.
        """
        return cls.size

    def __hash__(cls):
        # Identity based, like the Python default, which keeps it
        # consistent with the identity based __eq__.
        return id(cls)

    def __eq__(cls, other):
        """RE == other -> bool

        True if RE and other are the same enzyme, i.e. the very same
        Python object.
        """
        return cls is other

    def __ne__(cls, other):
        """RE != other -> bool.

        Strict isoschizomers (same recognition site, same restriction)
        give False; everything else gives True.

        WARNING - This is not the inverse of the __eq__ method.
        """
        if not isinstance(other, RestrictionType):
            return True
        return not (cls.charac == other.charac)

    def __rshift__(cls, other):
        """RE >> other -> bool.

        Neoschizomers (same recognition site, different restriction)
        give True; everything else gives False.
        """
        if not isinstance(other, RestrictionType):
            return False
        return cls.site == other.site and cls.charac != other.charac

    def __mod__(cls, other):
        """a % b -> bool.

        Test compatibility of the overhang of a and b.
        True if a and b have compatible overhang.
        Raises TypeError if b is not an enzyme.
        """
        if isinstance(other, RestrictionType):
            return cls._mod1(other)
        raise TypeError(
            'expected RestrictionType, got %s instead' % type(other))

    def __ge__(cls, other):
        """a >= b -> bool.

        a is greater than or equal to b if its site is longer than the
        site of b; sites of equal length are ordered alphabetically by
        enzyme name.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        if len(cls) > len(other):
            return True
        return cls.size == len(other) and cls.__name__ >= other.__name__

    def __gt__(cls, other):
        """a > b -> bool.

        Sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        if len(cls) > len(other):
            return True
        return cls.size == len(other) and cls.__name__ > other.__name__

    def __le__(cls, other):
        """a <= b -> bool.

        Sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        if len(cls) < len(other):
            return True
        return len(cls) == len(other) and cls.__name__ <= other.__name__

    def __lt__(cls, other):
        """a < b -> bool.

        Sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        if len(cls) < len(other):
            return True
        return len(cls) == len(other) and cls.__name__ < other.__name__
447 448 -class AbstractCut(RestrictionType):
449 """Implement the methods that are common to all restriction enzymes. 450 451 All the methods are classmethod. 452 453 For internal use only. Not meant to be instantiate.""" 454 455 @classmethod
456 - def search(cls, dna, linear=True):
457 """RE.search(dna, linear=True) -> list. 458 459 return a list of all the site of RE in dna. Compensate for circular 460 sequences and so on. 461 462 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 463 464 if linear is False, the restriction sites than span over the boundaries 465 will be included. 466 467 The positions are the first base of the 3' fragment, 468 i.e. the first base after the position the enzyme will cut. """ 469 # 470 # Separating search from _search allow a (very limited) optimisation 471 # of the search when using a batch of restriction enzymes. 472 # in this case the DNA is tested once by the class which implements 473 # the batch instead of being tested by each enzyme single. 474 # see RestrictionBatch.search() for example. 475 # 476 if isinstance(dna, FormattedSeq): 477 cls.dna = dna 478 return cls._search() 479 else: 480 cls.