Package Bio :: Package Restriction :: Module Restriction
[hide private]
[frames | no frames]

Source Code for Module Bio.Restriction.Restriction

   1  #!/usr/bin/env python 
   2  # 
   3  #      Restriction Analysis Libraries. 
   4  #      Copyright (C) 2004. Frederic Sohm. 
   5  # 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  # 
  10   
  11  """Restriction Enzyme classes. 
  12   
  13  Notes about the various classes of the restriction enzyme implementation:: 
  14   
  15              RestrictionType is the type of all restriction enzymes. 
  16          ---------------------------------------------------------------------------- 
  17              AbstractCut implements some methods that are common to all enzymes. 
  18          ---------------------------------------------------------------------------- 
  19              NoCut, OneCut,TwoCuts   represent the number of double strand cuts 
  20                                      produced by the enzyme. 
  21                                      they correspond to the 4th field of the 
  22                                      rebase record emboss_e.NNN. 
  23                      0->NoCut    : the enzyme is not characterised. 
  24                      2->OneCut   : the enzyme produces one double strand cut. 
  25                      4->TwoCuts  : two double strand cuts. 
  26          ---------------------------------------------------------------------------- 
  27              Meth_Dep, Meth_Undep    represent the methylation susceptibility to 
  28                                      the enzyme. 
  29                                      Not implemented yet. 
  30          ---------------------------------------------------------------------------- 
  31              Palindromic,            if the site is palindromic or not. 
  32              NotPalindromic          allow some optimisations of the code. 
  33                                      No need to check the reverse strand 
  34                                      with palindromic sites. 
  35          ---------------------------------------------------------------------------- 
  36              Unknown, Blunt,         represent the overhang. 
  37              Ov5, Ov3                Unknown is here for symmetry reasons and 
  38                                      correspond to enzymes that are not 
  39                                      characterised in rebase. 
  40          ---------------------------------------------------------------------------- 
  41              Defined, Ambiguous,     represent the sequence of the overhang. 
  42              NotDefined 
  43                                      NotDefined is for enzymes not characterised 
  44                                      in rebase. 
  45   
  46                                      Defined corresponds to enzymes that display 
  47                                      a constant overhang whatever the sequence. 
  48                                      ex : EcoRI. G^AATTC -> overhang :AATT 
  49                                                  CTTAA^G 
  50   
  51                                      Ambiguous : the overhang varies with the 
  52                                      sequence restricted. 
  53                                      Typically enzymes which cut outside their 
  54                                      restriction site or (but not always) 
  55                                      inside an ambiguous site. 
  56                                      ex: 
  57                                      AcuI CTGAAG(22/20)  -> overhang : NN 
  58                                      AasI GACNNN^NNNGTC  -> overhang : NN 
  59                                           CTGN^NNNNNCAG 
  60   
  61                  note : these 3 classes refer to the overhang not the site. 
  62                     So the enzyme ApoI (RAATTY) is defined even if its 
  63                     restriction site is ambiguous. 
  64   
  65                          ApoI R^AATTY -> overhang : AATT -> Defined 
  66                               YTTAA^R 
  67                     Accordingly, blunt enzymes are always Defined even 
  68                     when they cut outside their restriction site. 
  69          ---------------------------------------------------------------------------- 
  70              Not_available,          as found in rebase file emboss_r.NNN files. 
  71              Commercially_available 
  72                                      allow the selection of the enzymes 
  73                                      according to their suppliers to reduce the 
  74                                      quantity of results. 
  75                                      Also will allow the implementation of 
  76                                      buffer compatibility tables. Not 
  77                                      implemented yet. 
  78   
  79                                      the list of suppliers is extracted from 
  80                                      emboss_s.NNN 
  81          ---------------------------------------------------------------------------- 
  82   
  83  """ 
  84   
  85  from __future__ import print_function 
  86   
  87  import warnings 
  88   
  89  from Bio._py3k import zip 
  90  from Bio._py3k import filter 
  91  from Bio._py3k import range 
  92   
  93  import re 
  94  import itertools 
  95   
  96  from Bio.Seq import Seq, MutableSeq 
  97  from Bio.Restriction.Restriction_Dictionary import rest_dict as enzymedict 
  98  from Bio.Restriction.Restriction_Dictionary import typedict 
  99  from Bio.Restriction.Restriction_Dictionary import suppliers as suppliers_dict 
 100  from Bio.Restriction.PrintFormat import PrintFormat 
 101  from Bio import BiopythonWarning 
102 103 104 # Used to use Bio.Restriction.DNAUtils.check_bases (and expose it under this 105 # namespace), but have deprecated that module. 106 107 108 -def _check_bases(seq_string):
109 """Check characters in a string (PRIVATE). 110 111 Remove digits and white space present in string. Allows any valid ambiguous 112 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, lower case are converted). 113 114 Other characters (e.g. symbols) trigger a TypeError. 115 116 Returns the string WITH A LEADING SPACE (!). This is for backwards 117 compatibility, and may in part be explained by the fact that 118 Bio.Restriction doesn't use zero based counting. 119 """ 120 # Remove white space and make upper case: 121 seq_string = "".join(seq_string.split()).upper() 122 # Remove digits 123 for c in "0123456789": 124 seq_string = seq_string.replace(c, "") 125 # Check only allowed IUPAC letters 126 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): 127 raise TypeError("Invalid character found in %s" % repr(seq_string)) 128 return " " + seq_string
# For each IUPAC nucleotide code, the string of IUPAC codes that share at
# least one base with it (e.g. 'A' is listed with 'R' = A/G and 'N' = any).
# NOTE(review): the consumers of this table are outside this chunk; the
# entry order and the letter order inside each value are kept unchanged.
matching = {
    'A': 'ARWMHVDN',
    'C': 'CYSMHBVN',
    'G': 'GRSKBVDN',
    'T': 'TYWKHBDN',
    'R': 'ABDGHKMNSRWV',
    'Y': 'CBDHKMNSTWVY',
    'W': 'ABDHKMNRTWVY',
    'S': 'CBDGHKMNSRVY',
    'M': 'ACBDHMNSRWVY',
    'K': 'BDGHKNSRTWVY',
    'H': 'ACBDHKMNSRTWVY',
    'B': 'CBDGHKMNSRTWVY',
    'V': 'ACBDGHKMNSRWVY',
    'D': 'ABDGHKMNSRTWVY',
    'N': 'ACBDGHKMNSRTWVY',
}

# Module-level alias: DNA is simply another name for Bio.Seq.Seq.
DNA = Seq
class FormattedSeq(object):
    """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

    Translate a Bio.Seq into a formatted sequence to be used with Restriction.

    Roughly: remove anything which is not IUPAC alphabet and then add a space
    in front of the sequence to get a biological index instead of a
    python index (i.e. index of the first base is 1 not 0).

    Retains information about the shape of the molecule linear (default) or
    circular. Restriction sites are searched over the edges of circular
    sequences.

    Attributes set by the constructor:
     - data: the checked, upper-cased sequence string with a leading space.
     - lower: True if the original sequence string was all lower case.
     - linear: True for a linear molecule, False for a circular one.
     - klass: the class of the original sequence (used to build slices).
     - alphabet: the alphabet of the original sequence.
    """

    def __init__(self, seq, linear=True):
        """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

        seq is either a Bio.Seq, Bio.MutableSeq or a FormattedSeq.
        if seq is a FormattedSeq, linear will have no effect on the
        shape of the sequence.

        Raises TypeError for any other type (and, via _check_bases, for
        sequences holding characters that are not IUPAC DNA letters).
        """
        if isinstance(seq, (Seq, MutableSeq)):
            stringy = str(seq)
            self.lower = stringy.islower()
            # Note this adds a leading space to the sequence (!)
            self.data = _check_bases(stringy)
            self.linear = linear
            self.klass = seq.__class__
            self.alphabet = seq.alphabet
        elif isinstance(seq, FormattedSeq):
            # Copy constructor: the shape of the source wins over `linear`.
            self.lower = seq.lower
            self.data = seq.data
            self.linear = seq.linear
            self.alphabet = seq.alphabet
            self.klass = seq.klass
        else:
            raise TypeError('expected Seq or MutableSeq, got %s' % type(seq))

    def __len__(self):
        """Return the number of bases (the leading space is not counted)."""
        return len(self.data) - 1

    def __repr__(self):
        """Return a debugging representation of the formatted sequence."""
        return 'FormattedSeq(%s, linear=%s)' % (repr(self[1:]),
                                                repr(self.linear))

    def __eq__(self, other):
        """Return True if other is a FormattedSeq with the same content.

        Two formatted sequences are equal when they hold the same bases,
        have the same shape (linear/circular), the same case flag, the same
        original sequence class and alphabets with the same repr.

        The stored state is compared directly rather than the repr() of
        the two objects: the repr of Bio.Seq objects is abbreviated for
        long sequences, so a repr based comparison would report long
        sequences differing only in their middle part as equal.
        """
        if not isinstance(other, FormattedSeq):
            return False
        return (self.data == other.data and
                self.linear == other.linear and
                self.lower == other.lower and
                self.klass is other.klass and
                # The alphabet is compared through its repr, as the
                # previous repr based comparison effectively did.
                repr(self.alphabet) == repr(other.alphabet))

    def __ne__(self, other):
        """Return the negation of __eq__.

        Python 2 does not derive '!=' from __eq__, so it is spelled out
        here to keep both operators consistent.
        """
        return not self == other

    def circularise(self):
        """Circularise sequence in place."""
        self.linear = False
        return

    def linearise(self):
        """Linearise sequence in place."""
        self.linear = True
        return

    def to_linear(self):
        """Make a new instance of sequence as linear."""
        new = self.__class__(self)
        new.linear = True
        return new

    def to_circular(self):
        """Make a new instance of sequence as circular."""
        new = self.__class__(self)
        new.linear = False
        return new

    def is_linear(self):
        """Return if sequence is linear (True) or circular (False)."""
        return self.linear

    def finditer(self, pattern, size):
        """Return a list of a given pattern which occurs in the sequence.

        The list is made of tuple (location, pattern.group).
        The latter is used with non palindromic sites.
        Pattern is the regular expression pattern corresponding to the
        enzyme restriction site.
        Size is the size of the restriction enzyme recognition-site size.
        """
        if self.is_linear():
            data = self.data
        else:
            # Circular molecule: append the first size-1 bases so that
            # sites spanning the origin can be matched as well.
            data = self.data + self.data[1:size]
        return [(i.start(), i.group) for i in re.finditer(pattern, data)]

    def __getitem__(self, i):
        """Return the item or slice i as an instance of the original class.

        The index applies to the stored string, whose position 0 is the
        leading space. The result is lower-cased when the original
        sequence was lower case.
        """
        if self.lower:
            return self.klass((self.data[i]).lower(), self.alphabet)
        return self.klass(self.data[i], self.alphabet)
238
class RestrictionType(type):
    """RestrictionType. Type from which all enzyme classes are derived.

    Implement the operator methods.
    """

    def __init__(cls, name='', bases=(), dct=None):
        """Initialize RestrictionType instance.

        Not intended to be used in normal operation. The enzymes are
        instantiated when importing the module.
        See below.

        The class attribute ``compsite`` (a regular expression pattern) is
        replaced by its compiled form.

        Raises ValueError if the name holds a hyphen, or if ``compsite`` is
        missing or cannot be compiled.
        """
        if "-" in name:
            raise ValueError("Problem with hyphen in %s as enzyme name"
                             % repr(name))
        # 2011/11/26 - Nobody knows what this call was supposed to accomplish,
        # but all unit tests seem to pass without it.
        # super(RestrictionType, cls).__init__(cls, name, bases, dct)
        #
        # Read the pattern once, with a default: if the attribute is missing
        # the error below is then still the documented ValueError instead of
        # an AttributeError raised while building the error message.
        pattern = getattr(cls, "compsite", None)
        try:
            cls.compsite = re.compile(pattern)
        except Exception:
            raise ValueError("Problem with regular expression, re.compiled(%s)"
                             % repr(pattern))

    def __add__(cls, other):
        """Add restriction enzyme to a RestrictionBatch().

        If other is an enzyme returns a batch of the two enzymes.
        If other is already a RestrictionBatch add enzyme to it.
        Any other type raises TypeError.
        """
        if isinstance(other, RestrictionType):
            return RestrictionBatch([cls, other])
        elif isinstance(other, RestrictionBatch):
            return other.add_nocheck(cls)
        else:
            raise TypeError

    def __div__(cls, other):
        """Override '/' operator to use as search method.

        >>> EcoRI/Seq('GAATTC')
        [2]

        Returns RE.search(other).
        """
        return cls.search(other)

    def __rdiv__(cls, other):
        """Override division with reversed operands to use as search method.

        >>> Seq('GAATTC')/EcoRI
        [2]

        Returns RE.search(other).
        """
        return cls.search(other)

    def __truediv__(cls, other):
        """Override Python 3 division operator to use as search method.

        Like __div__.
        """
        return cls.search(other)

    def __rtruediv__(cls, other):
        """As __truediv__, with reversed operands.

        Like __rdiv__.
        """
        return cls.search(other)

    def __floordiv__(cls, other):
        """Override '//' operator to use as catalyse method.

        >>> EcoRI//Seq('GAATTC')
        (Seq('G', Alphabet()), Seq('AATTC', Alphabet()))

        Returns RE.catalyse(other).
        """
        return cls.catalyse(other)

    def __rfloordiv__(cls, other):
        """As __floordiv__, with reversed operands.

        >>> Seq('GAATTC')//EcoRI
        (Seq('G', Alphabet()), Seq('AATTC', Alphabet()))

        Returns RE.catalyse(other).
        """
        return cls.catalyse(other)

    def __str__(cls):
        """Return the name of the enzyme as string."""
        return cls.__name__

    def __repr__(cls):
        """Implement repr method.

        Used with eval or exec will instantiate the enzyme.
        """
        return "%s" % cls.__name__

    def __len__(cls):
        """Return length of recognition site of enzyme as int."""
        return cls.size

    def __hash__(cls):
        """Return the identity based hash of the enzyme class."""
        # Python default is to use id(...)
        # This is consistent with the __eq__ implementation
        return id(cls)

    def __eq__(cls, other):
        """Override '==' operator.

        True if RE and other are the same enzyme.

        Specifically this checks they are the same Python object.
        """
        return cls is other

    def __ne__(cls, other):
        """Override '!=' operator.

        Isoschizomer strict (same recognition site, same restriction) -> False
        All the other-> True

        WARNING - This is not the inverse of the __eq__ method

        >>> SacI != SstI  # true isoschizomers
        False
        >>> SacI == SstI
        False
        """
        same_charac = (isinstance(other, RestrictionType) and
                       cls.charac == other.charac)
        return not same_charac

    def __rshift__(cls, other):
        """Override '>>' operator to test for neoschizomers.

        neoschizomer : same recognition site, different restriction. -> True
        all the others : -> False

        >>> SmaI >> XmaI
        True
        """
        if not isinstance(other, RestrictionType):
            return False
        return bool(cls.site == other.site and cls.charac != other.charac)

    def __mod__(cls, other):
        """Override '%' operator to test for compatible overhangs.

        True if a and b have compatible overhang.
        Raises TypeError if other is not an enzyme.

        >>> XhoI % SalI
        True
        """
        if not isinstance(other, RestrictionType):
            raise TypeError(
                'expected RestrictionType, got %s instead' % type(other))
        return cls._mod1(other)

    # The four ordering operators below sort enzymes by
    #   1. size of the recognition site,
    #   2. if equal size, alphabetical order of the names,
    # which is exactly the ordering of the (size, name) tuples they compare.
    # They keep raising NotImplementedError for operands which are not
    # enzymes, as callers may rely on that exception.

    def __ge__(cls, other):
        """Compare length of recognition site of two enzymes.

        Override '>='. a is greater or equal than b if the a site is longer
        than b site. If their site have the same length sort by alphabetical
        order of their names.

        >>> EcoRI.size
        6
        >>> EcoRV.size
        6
        >>> EcoRI >= EcoRV
        False
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) >= (len(other), other.__name__)

    def __gt__(cls, other):
        """Compare length of recognition site of two enzymes.

        Override '>'. Sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.

        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) > (len(other), other.__name__)

    def __le__(cls, other):
        """Compare length of recognition site of two enzymes.

        Override '<='. Sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.

        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) <= (len(other), other.__name__)

    def __lt__(cls, other):
        """Compare length of recognition site of two enzymes.

        Override '<'. Sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.

        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) < (len(other), other.__name__)
476
477 478 -class AbstractCut(RestrictionType):
479 """Implement the methods that are common to all restriction enzymes. 480 481 All the methods are classmethod. 482 483 For internal use only. Not meant to be instantiated. 484 """ 485 486 @classmethod
487 - def search(cls, dna, linear=True):
488 """Return a list of cutting sites of the enzyme in the sequence. 489 490 Compensate for circular sequences and so on. 491 492 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 493 494 If linear is False, the restriction sites that span over the boundaries 495 will be included. 496 497 The positions are the first base of the 3' fragment, 498 i.e. the first base after the position the enzyme will cut. 499 """ 500 # 501 # Separating search from _search allow a (very limited) optimisation 502 # of the search when using a batch of restriction enzymes. 503 # in this case the DNA is tested once by the class which implements 504 # the batch instead of being tested by each enzyme single. 505 # see RestrictionBatch.search() for example. 506 # 507 if isinstance(