Package Bio :: Package Restriction :: Module Restriction
[hide private]
[frames | no frames]

Source Code for Module Bio.Restriction.Restriction

   1  #!/usr/bin/env python 
   2  # 
   3  #      Restriction Analysis Libraries. 
   4  #      Copyright (C) 2004. Frederic Sohm. 
   5  # 
   6  # This code is part of the Biopython distribution and governed by its 
   7  # license.  Please see the LICENSE file that should have been included 
   8  # as part of this package. 
   9  # 
  10   
  11  """Restriction Enzyme classes. 
  12   
  13  Notes about the various classes of the restriction enzyme implementation::
  14   
  15              RestrictionType is the type of all restriction enzymes. 
  16          ---------------------------------------------------------------------------- 
  17              AbstractCut implements some methods that are common to all enzymes. 
  18          ---------------------------------------------------------------------------- 
  19              NoCut, OneCut,TwoCuts   represent the number of double strand cuts 
  20                                      produced by the enzyme. 
  21                                      they correspond to the 4th field of the 
  22                                      rebase record emboss_e.NNN. 
  23                      0->NoCut    : the enzyme is not characterised. 
  24                      2->OneCut   : the enzyme produces one double strand cut.
  25                      4->TwoCuts  : two double strand cuts. 
  26          ---------------------------------------------------------------------------- 
  27              Meth_Dep, Meth_Undep    represent the methylation susceptibility to 
  28                                      the enzyme. 
  29                                      Not implemented yet. 
  30          ---------------------------------------------------------------------------- 
  31              Palindromic,            if the site is palindromic or not. 
  32              NotPalindromic          allow some optimisations of the code. 
  33                                      No need to check the reverse strand 
  34                                      with palindromic sites. 
  35          ---------------------------------------------------------------------------- 
  36              Unknown, Blunt,         represent the overhang. 
  37              Ov5, Ov3                Unknown is here for symmetry reasons and 
  38                                      corresponds to enzymes that are not
  39                                      characterised in rebase. 
  40          ---------------------------------------------------------------------------- 
  41              Defined, Ambiguous,     represent the sequence of the overhang. 
  42              NotDefined 
  43                                      NotDefined is for enzymes not characterised 
  44                                      in rebase. 
  45   
  46                                      Defined corresponds to enzymes that display
  47                                      a constant overhang whatever the sequence. 
  48                                      ex : EcoRI. G^AATTC -> overhang :AATT 
  49                                                  CTTAA^G 
  50   
  51                                      Ambiguous : the overhang varies with the 
  52                                      sequence restricted. 
  53                                      Typically enzymes which cut outside their 
  54                                      restriction site or (but not always) 
  55                                      inside an ambiguous site. 
  56                                      ex: 
  57                                      AcuI CTGAAG(22/20)  -> overhang : NN 
  58                                      AasI GACNNN^NNNGTC  -> overhang : NN 
  59                                           CTGN^NNNNNCAG 
  60   
  61                  note : these 3 classes refer to the overhang, not the site.
  62                     So the enzyme ApoI (RAATTY) is defined even if its 
  63                     restriction site is ambiguous. 
  64   
  65                          ApoI R^AATTY -> overhang : AATT -> Defined 
  66                               YTTAA^R 
  67                     Accordingly, blunt enzymes are always Defined even 
  68                     when they cut outside their restriction site. 
  69          ---------------------------------------------------------------------------- 
  70              Not_available,          as found in rebase file emboss_r.NNN files. 
  71              Commercially_available 
  72                                      allow the selection of the enzymes 
  73                                      according to their suppliers to reduce the 
  74                                      quantity of results. 
  75                                      Also will allow the implementation of 
  76                                      buffer compatibility tables. Not 
  77                                      implemented yet. 
  78   
  79                                      the list of suppliers is extracted from 
  80                                      emboss_s.NNN 
  81          ---------------------------------------------------------------------------- 
  82  """ 
  83   
  84  from __future__ import print_function 
  85   
  86  import warnings 
  87   
  88  from Bio._py3k import zip 
  89  from Bio._py3k import filter 
  90  from Bio._py3k import range 
  91   
  92  import re 
  93  import itertools 
  94   
  95  from Bio.Seq import Seq, MutableSeq 
  96   
  97  from Bio.Restriction.Restriction_Dictionary import rest_dict as enzymedict 
  98  from Bio.Restriction.Restriction_Dictionary import typedict 
  99  from Bio.Restriction.Restriction_Dictionary import suppliers as suppliers_dict 
 100  from Bio.Restriction.RanaConfig import ConsoleWidth, NameWidth, Indent, MaxSize 
 101  from Bio.Restriction.RanaConfig import ftp_proxy, ftp_Rebase 
 102  from Bio.Restriction.RanaConfig import ftp_emb_e, ftp_emb_s, ftp_emb_r 
 103  from Bio.Restriction.PrintFormat import PrintFormat 
 104  from Bio import BiopythonWarning 
105 106 107 # Used to use Bio.Restriction.DNAUtils.check_bases (and expose it under this 108 # namespace), but have deprecated that module. 109 110 111 -def _check_bases(seq_string):
112 """Check characters in a string (PRIVATE). 113 114 Remove digits and white space present in string. Allows any valid ambiguous 115 IUPAC DNA single letters codes (ABCDGHKMNRSTVWY, lower case are converted). 116 117 Other characters (e.g. symbols) trigger a TypeError. 118 119 Returns the string WITH A LEADING SPACE (!). This is for backwards 120 compatibility, and may in part be explained by the fact that 121 Bio.Restriction doesn't use zero based counting. 122 """ 123 # Remove white space and make upper case: 124 seq_string = "".join(seq_string.split()).upper() 125 # Remove digits 126 for c in "0123456789": 127 seq_string = seq_string.replace(c, "") 128 # Check only allowed IUPAC letters 129 if not set(seq_string).issubset(set("ABCDGHKMNRSTVWY")): 130 raise TypeError("Invalid character found in %s" % repr(seq_string)) 131 return " " + seq_string
# Lookup table keyed by IUPAC nucleotide code; each value is a string of
# IUPAC codes.
# NOTE(review): presumably the codes regarded as compatible with the key when
# ambiguous bases are compared - confirm against the code that reads it.
matching = {
    'A': 'ARWMHVDN',
    'C': 'CYSMHBVN',
    'G': 'GRSKBVDN',
    'T': 'TYWKHBDN',
    'R': 'ABDGHKMNSRWV',
    'Y': 'CBDHKMNSTWVY',
    'W': 'ABDHKMNRTWVY',
    'S': 'CBDGHKMNSRVY',
    'M': 'ACBDHMNSRWVY',
    'K': 'BDGHKNSRTWVY',
    'H': 'ACBDHKMNSRTWVY',
    'B': 'CBDGHKMNSRTWVY',
    'V': 'ACBDGHKMNSRWVY',
    'D': 'ABDGHKMNSRTWVY',
    'N': 'ACBDGHKMNSRTWVY',
}

# Module-level alias for Bio.Seq.Seq.
DNA = Seq
class FormattedSeq(object):
    """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

    Translate a Bio.Seq into a formatted sequence to be used with Restriction.

    Roughly: white space and digits are stripped, the remaining letters must
    be IUPAC codes (see _check_bases), and a space is added in front of the
    sequence to get a biological index instead of a python index
    (i.e. index of the first base is 1 not 0).

    Retains information about the shape of the molecule: linear (default)
    or circular. Restriction sites are searched over the edges of circular
    sequences.

    Attributes:
     - data     - the cleaned, upper case sequence string with a leading space.
     - lower    - True if the original sequence string was all lower case.
     - linear   - True for a linear molecule, False for a circular one.
     - klass    - class of the wrapped sequence, used to build slices.
     - alphabet - alphabet of the wrapped sequence.
    """

    def __init__(self, seq, linear=True):
        """FormattedSeq(seq, [linear=True])-> new FormattedSeq.

        seq is either a Bio.Seq, Bio.MutableSeq or a FormattedSeq.
        If seq is a FormattedSeq, it is copied and linear has no effect on
        the shape of the sequence.

        Raises TypeError for any other type of seq.
        """
        if isinstance(seq, FormattedSeq):
            # Copy branch: every attribute, including the shape, comes from
            # the source object. A FormattedSeq is never a Seq, so testing
            # this case first does not change which branch is taken.
            self.lower = seq.lower
            self.data = seq.data
            self.linear = seq.linear
            self.alphabet = seq.alphabet
            self.klass = seq.klass
        elif isinstance(seq, (Seq, MutableSeq)):
            stringy = str(seq)
            self.lower = stringy.islower()
            # Note this adds a leading space to the sequence (!)
            self.data = _check_bases(stringy)
            self.linear = linear
            self.klass = seq.__class__
            self.alphabet = seq.alphabet
        else:
            raise TypeError('expected Seq or MutableSeq, got %s' % type(seq))

    def __len__(self):
        """Return the number of bases (the leading space is not counted)."""
        return len(self.data) - 1

    def __repr__(self):
        """Return a string showing the wrapped sequence and its shape."""
        return 'FormattedSeq(%s, linear=%s)' % (repr(self[1:]),
                                                repr(self.linear))

    def __eq__(self, other):
        """FS == other -> bool.

        True only if other is a FormattedSeq with the same repr().
        """
        # NOTE(review): equality is delegated to repr() of the wrapped
        # sequence class. If that repr abbreviates long sequences, two
        # different long sequences could compare equal - confirm against
        # Bio.Seq before relying on this for long inputs.
        if not isinstance(other, FormattedSeq):
            return False
        return repr(self) == repr(other)

    def __ne__(self, other):
        """FS != other -> bool.

        Explicit inverse of __eq__. Python 2 does not derive != from ==, so
        without this method two equal FormattedSeq objects would still
        compare as different there.
        """
        return not self.__eq__(other)

    def circularise(self):
        """FS.circularise() -> circularise FS (in place)."""
        self.linear = False
        return

    def linearise(self):
        """FS.linearise() -> linearise FS (in place)."""
        self.linear = True
        return

    def to_linear(self):
        """FS.to_linear() -> new linear FS instance."""
        new = self.__class__(self)
        new.linear = True
        return new

    def to_circular(self):
        """FS.to_circular() -> new circular FS instance."""
        new = self.__class__(self)
        new.linear = False
        return new

    def is_linear(self):
        """FS.is_linear() -> bool.

        True if the sequence will be analysed as a linear sequence.
        """
        return self.linear

    def finditer(self, pattern, size):
        """FS.finditer(pattern, size) -> list.

        Return a list of the matches of pattern in the sequence.
        The list is made of tuples (location, match.group), where location
        is the start index of the match in the space-prefixed data and
        match.group is the bound group method of the match object.
        The latter is used with non palindromic sites.

        pattern is the regular expression pattern corresponding to the
        enzyme restriction site.
        size is the size of the restriction enzyme recognition site.
        """
        if self.is_linear():
            data = self.data
        else:
            # Circular molecule: append the first size - 1 bases so that
            # sites spanning the origin can be matched as well.
            data = self.data + self.data[1:size]
        return [(i.start(), i.group) for i in re.finditer(pattern, data)]

    def __getitem__(self, i):
        """Return the item or slice i as an instance of the wrapped class.

        The result is lower case if the original sequence was lower case.
        """
        if self.lower:
            return self.klass((self.data[i]).lower(), self.alphabet)
        return self.klass(self.data[i], self.alphabet)
246
class RestrictionType(type):
    """Metaclass from which all the enzyme classes derive.

    It provides the operator methods (/, //, +, %, >>, the comparisons and
    so on) that can be applied directly to the enzyme classes.
    """

    def __init__(cls, name='', bases=(), dct=None):
        """RE(name, bases, dct) -> RestrictionType instance.

        Check the enzyme name and replace cls.compsite by its compiled
        regular expression.

        Not intended to be used in normal operation. The enzymes are
        instantiated when importing the module.

        Raises ValueError if the name contains a hyphen or if cls.compsite
        cannot be compiled.
        """
        if "-" in name:
            raise ValueError("Problem with hyphen in %s as enzyme name"
                             % repr(name))
        # 2011/11/26 - Nobody knows what this call was supposed to accomplish,
        # but all unit tests seem to pass without it.
        # super(RestrictionType, cls).__init__(cls, name, bases, dct)
        try:
            cls.compsite = re.compile(cls.compsite)
        except Exception:
            raise ValueError("Problem with regular expression, re.compiled(%s)"
                             % repr(cls.compsite))

    def __add__(cls, other):
        """RE + other -> RestrictionBatch.

        With another enzyme, return a batch holding both enzymes.
        With a RestrictionBatch, add this enzyme to that batch.
        Anything else raises TypeError.
        """
        if isinstance(other, RestrictionType):
            return RestrictionBatch([cls, other])
        if isinstance(other, RestrictionBatch):
            return other.add_nocheck(cls)
        raise TypeError

    def __div__(cls, other):
        """RE / other -> list (Python 2 division).

        Same as RE.search(other).
        """
        return cls.search(other)

    def __rdiv__(cls, other):
        """other / RE -> list (Python 2 division).

        Same as RE.search(other).
        """
        return cls.search(other)

    def __truediv__(cls, other):
        """RE / other -> list.

        Same as RE.search(other).
        """
        return cls.search(other)

    def __rtruediv__(cls, other):
        """other / RE -> list.

        Same as RE.search(other).
        """
        return cls.search(other)

    def __floordiv__(cls, other):
        """RE // other -> list.

        Same as RE.catalyse(other).
        """
        return cls.catalyse(other)

    def __rfloordiv__(cls, other):
        """other // RE -> list.

        Same as RE.catalyse(other).
        """
        return cls.catalyse(other)

    def __str__(cls):
        """str(RE) -> the name of the enzyme."""
        return cls.__name__

    def __repr__(cls):
        """repr(RE) -> the name of the enzyme.

        Used with eval or exec it will give back the enzyme.
        """
        return str(cls.__name__)

    def __len__(cls):
        """len(RE) -> int, the length of the recognition site."""
        return cls.size

    def __hash__(cls):
        """Hash on the identity of the class, in line with __eq__."""
        # Python default is to use id(...)
        return id(cls)

    def __eq__(cls, other):
        """RE == other -> bool.

        True only if RE and other are the very same Python object.
        """
        return cls is other

    def __ne__(cls, other):
        """RE != other -> bool.

        Strict isoschizomers (same recognition site, same restriction)
        give False, everything else gives True.

        WARNING - This is not the inverse of the __eq__ method.
        """
        same_behaviour = (isinstance(other, RestrictionType)
                          and cls.charac == other.charac)
        return not same_behaviour

    def __rshift__(cls, other):
        """RE >> other -> bool.

        True for neoschizomers (same recognition site, different
        restriction), False in every other case.
        """
        if not isinstance(other, RestrictionType):
            return False
        return bool(cls.site == other.site and cls.charac != other.charac)

    def __mod__(cls, other):
        """a % b -> bool.

        Test the compatibility of the overhangs of a and b.
        True if a and b have compatible overhangs.
        Raises TypeError if b is not an enzyme.
        """
        if isinstance(other, RestrictionType):
            return cls._mod1(other)
        raise TypeError(
            'expected RestrictionType, got %s instead' % type(other))

    def __ge__(cls, other):
        """a >= b -> bool.

        Enzymes are ordered by the size of their recognition site first
        and by the alphabetical order of their names second.
        Raises NotImplementedError if b is not an enzyme.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        # Tuples compare element by element: size first, then name.
        return (len(cls), cls.__name__) >= (len(other), other.__name__)

    def __gt__(cls, other):
        """a > b -> bool.

        Sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.
        Raises NotImplementedError if b is not an enzyme.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) > (len(other), other.__name__)

    def __le__(cls, other):
        """a <= b -> bool.

        Sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.
        Raises NotImplementedError if b is not an enzyme.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) <= (len(other), other.__name__)

    def __lt__(cls, other):
        """a < b -> bool.

        Sorting order:
            1. size of the recognition site.
            2. if equal size, alphabetical order of the names.
        Raises NotImplementedError if b is not an enzyme.
        """
        if not isinstance(other, RestrictionType):
            raise NotImplementedError
        return (len(cls), cls.__name__) < (len(other), other.__name__)
460
461 462 -class AbstractCut(RestrictionType):
463 """Implement the methods that are common to all restriction enzymes. 464 465 All the methods are classmethod. 466 467 For internal use only. Not meant to be instantiate. 468 """ 469 470 @classmethod
471 - def search(cls, dna, linear=True):
472 """RE.search(dna, linear=True) -> list. 473 474 return a list of all the site of RE in dna. Compensate for circular 475 sequences and so on. 476 477 dna must be a Bio.Seq.Seq instance or a Bio.Seq.MutableSeq instance. 478 479 if linear is False, the restriction sites than span over the boundaries 480 will be included. 481 482 The positions are the first base of the 3' fragment, 483 i.e. the first base after the position the enzyme will cut. 484 """ 485 # 486 # Separating search from _search allow a (very limited) optimisation 487 # of the search when using a batch of restriction enzymes. 488 # in this case the DNA is tested once by the class which implements 489 # the batch instead of being tested by each enzyme single. 490 # see RestrictionBatch.search() for example. 491 # 492 if isinstance(