Package Bio :: Package Restriction :: Module Restriction
[hide private]
[frames] | no frames]

Source Code for Module Bio.Restriction.Restriction

   1  #!/usr/bin/env python 
   2  # 
   3  #      Restriction Analysis Libraries. 
   4  #      Copyright (C) 2004. Frederic Sohm. 
   5  # 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  # 
  10   
  11  """ Notes about the various classes of the restriction enzyme implementation. 
  12   
  13          RestrictionType is the type of all restriction enzymes. 
  14      ---------------------------------------------------------------------------- 
  15          AbstractCut implements some methods that are common to all enzymes. 
  16      ---------------------------------------------------------------------------- 
  17          NoCut, OneCut,TwoCuts   represent the number of double strand cuts 
  18                                  produced by the enzyme. 
  19                                  they correspond to the 4th field of the rebase 
  20                                  record emboss_e.NNN. 
  21                  0->NoCut    : the enzyme is not characterised. 
  22                  2->OneCut   : the enzyme produces one double strand cut. 
  23                  4->TwoCuts  : two double strand cuts. 
  24      ---------------------------------------------------------------------------- 
  25          Meth_Dep, Meth_Undep    represent the methylation susceptibility to 
  26                                  the enzyme. 
  27                                  Not implemented yet. 
  28      ---------------------------------------------------------------------------- 
  29          Palindromic,            if the site is palindromic or not. 
  30          NotPalindromic          allow some optimisations of the code. 
  31                                  No need to check the reverse strand 
  32                                  with palindromic sites. 
  33      ---------------------------------------------------------------------------- 
  34          Unknown, Blunt,         represent the overhang. 
  35          Ov5, Ov3                Unknown is here for symmetry reasons and 
  36                                  corresponds to enzymes that are not characterised 
  37                                  in rebase. 
  38      ---------------------------------------------------------------------------- 
  39          Defined, Ambiguous,     represent the sequence of the overhang. 
  40          NotDefined 
  41                                  NotDefined is for enzymes not characterised in 
  42                                  rebase. 
  43   
  44                                  Defined corresponds to enzymes that display a 
  45                                  constant overhang whatever the sequence. 
  46                                  ex : EcoRI. G^AATTC -> overhang :AATT 
  47                                              CTTAA^G 
  48   
  49                                  Ambiguous : the overhang varies with the 
  50                                  sequence restricted. 
  51                                  Typically enzymes which cut outside their 
  52                                  restriction site or (but not always) 
  53                                  inside an ambiguous site. 
  54                                  ex: 
  55                                  AcuI CTGAAG(22/20)  -> overhang : NN 
  56                                  AasI GACNNN^NNNGTC  -> overhang : NN 
  57                                       CTGN^NNNNNCAG 
  58   
  59              note : these 3 classes refer to the overhang not the site. 
  60                 So the enzyme ApoI (RAATTY) is defined even if its restriction 
  61                 site is ambiguous. 
  62   
  63                      ApoI R^AATTY -> overhang : AATT -> Defined 
  64                           YTTAA^R 
  65                 Accordingly, blunt enzymes are always Defined even 
  66                 when they cut outside their restriction site. 
  67      ---------------------------------------------------------------------------- 
  68          Not_available,          as found in rebase file emboss_r.NNN files. 
  69          Commercially_available 
  70                                  allow the selection of the enzymes according to 
  71                                  their suppliers to reduce the quantity 
  72                                  of results. 
  73                                  Also will allow the implementation of buffer 
  74                                  compatibility tables. Not implemented yet. 
  75   
  76                                  the list of suppliers is extracted from 
  77                                  emboss_s.NNN 
  78      ---------------------------------------------------------------------------- 
  79          """ 
  80   
  81  import re 
  82  import itertools 
  83   
  84  from Bio.Seq import Seq, MutableSeq 
  85  from Bio.Alphabet import IUPAC 
  86   
  87  from Bio.Restriction.Restriction_Dictionary import rest_dict as enzymedict 
  88  from Bio.Restriction.Restriction_Dictionary import typedict 
  89  from Bio.Restriction.Restriction_Dictionary import suppliers as suppliers_dict 
  90  # TODO: Consider removing this wildcard import. 
  91  from Bio.Restriction.RanaConfig import * 
  92  from Bio.Restriction.PrintFormat import PrintFormat 
93 94 95 #Used to use Bio.Restriction.DNAUtils.check_bases (and expose it under this 96 #namespace), but have deprecated that module. 97 -def _check_bases(seq_string):
98 """Check characters in a string (PRIVATE). 99 100 Remove digits and white space present in string. Allows any valid ambiguous 101 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, lower case are converted). 102 103 Other characters (e.g. symbols) trigger a TypeError. 104 105 Returns the string WITH A LEADING SPACE (!). This is for backwards 106 compatibility, and may in part be explained by the fact that 107 Bio.Restriction doesn't use zero based counting. 108 """ 109 #Remove white space and make upper case: 110 seq_string = "".join(seq_string.split()).upper() 111 #Remove digits 112 for c in "0123456789": 113 seq_string = seq_string.replace(c, "") 114 #Check only allowed IUPAC letters 115 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")) : 116 raise TypeError("Invalid character found in %s" % repr(seq_string)) 117 return " " + seq_string
# Lookup table keyed by IUPAC DNA letter.  Each value is a string of IUPAC
# letters; judging from the entries these look like the codes that share at
# least one base with the key (e.g. 'A' -> every code that can stand for A)
# -- NOTE(review): confirm against the code that consumes this table.
matching = {
    'A': 'ARWMHVDN',
    'C': 'CYSMHBVN',
    'G': 'GRSKBVDN',
    'T': 'TYWKHBDN',
    'R': 'ABDGHKMNSRWV',
    'Y': 'CBDHKMNSTWVY',
    'W': 'ABDHKMNRTWVY',
    'S': 'CBDGHKMNSRVY',
    'M': 'ACBDHMNSRWVY',
    'K': 'BDGHKNSRTWVY',
    'H': 'ACBDHKMNSRTWVY',
    'B': 'CBDGHKMNSRTWVY',
    'V': 'ACBDGHKMNSRWVY',
    'D': 'ABDGHKMNSRTWVY',
    'N': 'ACBDGHKMNSRTWVY',
}

# Module-level alias for Bio.Seq.Seq.
DNA = Seq
class FormattedSeq(object):
    """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

    Translate a Bio.Seq into a formatted sequence to be used with Restriction.

    Roughly:
    remove anything which is not IUPAC alphabet and then add a space
    in front of the sequence to get a biological index instead of a
    python index (i.e. index of the first base is 1 not 0).

    Retains information about the shape of the molecule linear (default)
    or circular. Restriction sites are searched over the edges of circular
    sequences.

    Attributes set by __init__:
        data     -- the cleaned sequence string, with one leading space.
        lower    -- True if the original string was entirely lower case.
        linear   -- True for a linear molecule, False for a circular one.
        klass    -- class of the original sequence, used by __getitem__.
        alphabet -- alphabet of the original sequence."""

    def __init__(self, seq, linear=True):
        """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

        seq is either a Bio.Seq, Bio.MutableSeq or a FormattedSeq.
        if seq is a FormattedSeq, linear will have no effect on the
        shape of the sequence.

        Raises TypeError for any other type of seq (and, through
        _check_bases, for sequences holding non IUPAC characters)."""
        if isinstance(seq, (Seq, MutableSeq)):
            stringy = str(seq)
            self.lower = stringy.islower()
            # Note this adds a leading space to the sequence (!)
            self.data = _check_bases(stringy)
            self.linear = linear
            self.klass = seq.__class__
            self.alphabet = seq.alphabet
        elif isinstance(seq, FormattedSeq):
            # Copy constructor: the shape is taken from seq, not from linear.
            self.lower = seq.lower
            self.data = seq.data
            self.linear = seq.linear
            self.alphabet = seq.alphabet
            self.klass = seq.klass
        else:
            raise TypeError('expected Seq or MutableSeq, got %s' % type(seq))

    def __len__(self):
        """Return the number of bases (the leading space is not counted)."""
        return len(self.data) - 1

    def __repr__(self):
        """Return 'FormattedSeq(<repr of the bases>, linear=<bool>)'."""
        return 'FormattedSeq(%s, linear=%s)' % (repr(self[1:]),
                                                repr(self.linear))

    def __eq__(self, other):
        """FS == other -> bool.

        True only if other is a FormattedSeq with the same repr, i.e. the
        same sequence representation and the same shape."""
        return isinstance(other, FormattedSeq) and repr(self) == repr(other)

    def __ne__(self, other):
        """FS != other -> bool.  Strict inverse of __eq__.

        Python 2 does not derive != from ==, so without this method two
        equal FormattedSeq instances would also compare as different."""
        return not self == other

    def circularise(self):
        """FS.circularise() -> circularise FS"""
        self.linear = False
        return

    def linearise(self):
        """FS.linearise() -> linearise FS"""
        self.linear = True
        return

    def to_linear(self):
        """FS.to_linear() -> new linear FS instance"""
        new = self.__class__(self)
        new.linear = True
        return new

    def to_circular(self):
        """FS.to_circular() -> new circular FS instance"""
        new = self.__class__(self)
        new.linear = False
        return new

    def is_linear(self):
        """FS.is_linear() -> bool.

        True if the sequence will be analysed as a linear sequence."""
        return self.linear

    def finditer(self, pattern, size):
        """FS.finditer(pattern, size) -> list.

        return a list of the matches of pattern in the sequence.
        the list is made of tuples (location, pattern.group), where the
        second item is the bound group method of the match object, not a
        string. It is used with non palindromic sites.
        pattern is the regular expression pattern corresponding to the
        enzyme restriction site.
        size is the size of the restriction enzyme recognition-site."""
        if self.is_linear():
            data = self.data
        else:
            # Circular molecule: append the first size-1 bases so that sites
            # spanning the origin can be matched as well.
            data = self.data + self.data[1:size]
        return [(i.start(), i.group) for i in re.finditer(pattern, data)]

    def __getitem__(self, i):
        """FS[i] -> instance of the original sequence class.

        The index or slice is applied to data, which still holds the leading
        space, so FS[1] is the first base. The case of the original sequence
        is restored when it was entirely lower case."""
        if self.lower:
            return self.klass((self.data[i]).lower(), self.alphabet)
        return self.klass(self.data[i], self.alphabet)
228
class RestrictionType(type):
    """RestrictionType. Type from which derives all enzyme classes.

    Implement the operator methods.

    This is a metaclass: the methods below receive the enzyme class itself
    as cls, so the operators apply to the enzyme classes, not to instances
    of them."""

    def __init__(cls, name='', bases=(), dct={}):
        """RE(name, bases, dct) -> RestrictionType instance.

        Not intended to be used in normal operation. The enzymes are
        instantiated when importing the module.

        Replaces cls.compsite (a pattern) by its compiled regular expression.

        Raises ValueError if name contains a hyphen or if cls.compsite
        cannot be compiled.

        see below."""
        # NOTE: dct is never modified here, so the shared default is harmless.
        if "-" in name:
            raise ValueError("Problem with hyphen in %s as enzyme name"
                             % repr(name))
        # 2011/11/26 - Nobody knows what this call was supposed to accomplish,
        # but all unit tests seem to pass without it.
        # super(RestrictionType, cls).__init__(cls, name, bases, dct)
        try:
            cls.compsite = re.compile(cls.compsite)
        except Exception:
            # The exception object itself is not needed; this form of the
            # except clause is accepted by both Python 2 and Python 3.
            raise ValueError("Problem with regular expression, re.compiled(%s)"
                             % repr(cls.compsite))

    def __add__(cls, other):
        """RE.__add__(other) -> RestrictionBatch().

        if other is an enzyme returns a batch of the two enzymes.
        if other is already a RestrictionBatch add enzyme to it.
        Any other type raises TypeError."""
        if isinstance(other, RestrictionType):
            return RestrictionBatch([cls, other])
        elif isinstance(other, RestrictionBatch):
            return other.add_nocheck(cls)
        else:
            raise TypeError(
                'expected RestrictionType or RestrictionBatch, got %s instead'
                % type(other))

    def __div__(cls, other):
        """RE.__div__(other) -> list.

        RE/other
        returns RE.search(other)."""
        return cls.search(other)

    def __rdiv__(cls, other):
        """RE.__rdiv__(other) -> list.

        other/RE
        returns RE.search(other)."""
        return cls.search(other)

    def __truediv__(cls, other):
        """RE.__truediv__(other) -> list.

        RE/other
        returns RE.search(other)."""
        return cls.search(other)

    def __rtruediv__(cls, other):
        """RE.__rtruediv__(other) -> list.

        other/RE
        returns RE.search(other)."""
        return cls.search(other)

    def __floordiv__(cls, other):
        """RE.__floordiv__(other) -> list.

        RE//other
        returns RE.catalyse(other)."""
        return cls.catalyse(other)

    def __rfloordiv__(cls, other):
        """RE.__rfloordiv__(other) -> list.

        other//RE
        returns RE.catalyse(other)."""
        return cls.catalyse(other)

    def __str__(cls):
        """RE.__str__() -> str.

        return the name of the enzyme."""
        return cls.__name__

    def __repr__(cls):
        """RE.__repr__() -> str.

        used with eval or exec will instantiate the enzyme."""
        return "%s" % cls.__name__

    def __len__(cls):
        """RE.__len__() -> int.

        length of the recognition site."""
        return cls.size

    def __hash__(cls):
        # Python default is to use id(...)
        # This is consistent with the __eq__ implementation
        return id(cls)

    def __eq__(cls, other):
        """RE == other -> bool

        True if RE and other are the same enzyme.

        Specifically this checks they are the same Python object.
        """
        return cls is other

    def __ne__(cls, other):
        """RE != other -> bool.
        isoschizomer strict, same recognition site, same restriction -> False
        all the other-> True

        WARNING - This is not the inverse of the __eq__ method.
        """
        if not isinstance(other, RestrictionType):
            return True
        return cls.charac != other.charac

    def __rshift__(cls, other):
        """RE >> other -> bool.

        neoschizomer : same recognition site, different restriction. -> True
        all the others :                                             -> False"""
        if not isinstance(other, RestrictionType):
            return False
        return cls.site == other.site and cls.charac != other.charac

    def __mod__(cls, other):
        """a % b -> bool.

        Test compatibility of the overhang of a and b.
        True if a and b have compatible overhang.
        Raises TypeError if b is not an enzyme."""
        if not isinstance(other, RestrictionType):
            raise TypeError(
                'expected RestrictionType, got %s instead' % type(other))
        return cls._mod1(other)

    # The four ordering operators share one sort key:
    #   1. size of the recognition site (len of the enzyme),
    #   2. if equal size, alphabetical order of the names.
    # Comparing against anything that is not an enzyme raises
    # NotImplementedError.

    def __ge__(cls, other):
        """a >= b -> bool.

        a is greater or equal than b if the a site is longer than b site.
        if their site have the same length sort by alphabetical order of their
        names."""
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) >= (len(other), other.__name__)

    def __gt__(cls, other):
        """a > b -> bool.

        sorting order:
                    1. size of the recognition site.
                    2. if equal size, alphabetical order of the names."""
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) > (len(other), other.__name__)

    def __le__(cls, other):
        """a <= b -> bool.

        sorting order:
                    1. size of the recognition site.
                    2. if equal size, alphabetical order of the names."""
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) <= (len(other), other.__name__)

    def __lt__(cls, other):
        """a < b -> bool.

        sorting order:
                    1. size of the recognition site.
                    2. if equal size, alphabetical order of the names."""
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) < (len(other), other.__name__)
436
437 438 -class AbstractCut(RestrictionType):
439 """Implement the methods that are common to all restriction enzymes. 440 441 All the methods are classmethod. 442 443 For internal use only. Not meant to be instantiate.""" 444 445 @classmethod
446 - def search(cls, dna, linear=True):
447 """RE.search(dna, linear=True) -> list. 448 449 return a list of all the site of RE in dna. Compensate for circular 450 sequences and so on. 451 452 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 453 454 if linear is False, the restriction sites than span over the boundaries 455 will be included. 456 457 The positions are the first base of the 3' fragment, 458 i.e. the first base after the position the enzyme will cut. """ 459 # 460 # Separating search from _search allow a (very limited) optimisation 461 # of the search when using a batch of restriction enzymes. 462 # in this case the DNA is tested once by the class which implements 463 # the batch instead of being tested by each enzyme single. 464 # see RestrictionBatch.search() for example. 465 # 466 if isinstance(dna, FormattedSeq): 467 cls.dna = dna 468 return cls._search() 469 else: 470 cls.dna = FormattedSeq(