Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # Modifications Copyright 2010 Jeffrey Finkelstein. All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  # 
  9  # Changes made by James Casbon: 
 10  # - New Astral class 
 11  # - SQL functionality for both Scop and Astral classes 
 12  # - All sunids are int not strings 
 13  # 
 14  # Code written by Jeffrey Chang to access SCOP over the internet, which 
 15  # was previously in Bio.WWW.SCOP, has now been merged into this module. 
 16   
 17   
 18  """ SCOP: Structural Classification of Proteins. 
 19   
 20  The SCOP database aims to provide a manually constructed classification of 
 21  all know protein structures into a hierarchy, the main levels of which 
 22  are family, superfamily and fold. 
 23   
 24  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 25   
 26  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 27   
 28  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 29   
 30  The Scop object in this module represents the entire SCOP classification. It 
 31  can be built from the three SCOP parsable files, modified is so desired, and 
 32  converted back to the same file formats. A single SCOP domain (represented 
 33  by the Domain class) can be obtained from Scop using the domain's SCOP 
 34  identifier (sid). 
 35   
 36   
 37  nodeCodeDict  -- A mapping between known 2 letter node codes and a longer 
 38                    description. The known node types are 'cl' (class), 'cf' 
 39                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain), 
 40                    'sp' (species), 'px' (domain). Additional node types may 
 41                    be added in the future. 
 42   
 43  This module also provides code to access SCOP over the WWW. 
 44   
 45  Functions: 
 46  search        -- Access the main CGI script. 
 47  _open         -- Internally used function. 
 48   
 49  """ 
 50   
 51  from __future__ import print_function 
 52   
 53  import os 
 54  import re 
 55   
 56  from . import Des 
 57  from . import Cla 
 58  from . import Hie 
 59  from . import Residues 
 60  from Bio import SeqIO 
 61  from Bio.Seq import Seq 
 62   
 63   
 64  nodeCodeDict = {'cl': 'class', 'cf': 'fold', 'sf': 'superfamily', 
 65                  'fa': 'family', 'dm': 'protein', 'sp': 'species', 'px': 'domain'} 
 66   
 67   
 68  _nodetype_to_code = {'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 69                       'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 70   
 71  nodeCodeOrder = ['ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px'] 
 72   
 73  astralBibIds = [10, 20, 25, 30, 35, 40, 50, 70, 90, 95, 100] 
 74   
 75  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 76               1e-20, 1e-25, 1e-50] 
 77   
 78  astralEv_to_file = {10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 79                      0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 80                      1e-4: 'e-4', 1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 81                      1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50'} 
 82   
 83  astralEv_to_sql = {10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 84                     0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 85                     1e-4: 'e_4', 1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 86                     1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50'} 
 87   
 88  try: 
 89      # See if the cmp function exists (will on Python 2) 
 90      _cmp = cmp 
 91  except NameError: 
92 - def _cmp(a, b):
93 """Implementation of cmp(x,y) for Python 3 (PRIVATE). 94 95 Based on Python 3 docs which say if you really need the cmp() 96 functionality, you could use the expression (a > b) - (a < b) 97 as the equivalent for cmp(a, b) 98 """ 99 return (a > b) - (a < b)
100 101
102 -def cmp_sccs(sccs1, sccs2):
103 """Order SCOP concise classification strings (sccs). 104 105 a.4.5.1 < a.4.5.11 < b.1.1.1 106 107 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 108 The letter represents the class, and the numbers are the fold, 109 superfamily, and family, respectively. 110 """ 111 s1 = sccs1.split(".") 112 s2 = sccs2.split(".") 113 114 if s1[0] != s2[0]: 115 return _cmp(s1[0], s2[0]) 116 117 s1 = [int(x) for x in s1[1:]] 118 s2 = [int(x) for x in s2[1:]] 119 120 return _cmp(s1, s2)
121 122 123 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 124 125
126 -def parse_domain(str):
127 """Convert an ASTRAL header string into a Scop domain. 128 129 An ASTRAL (http://astral.stanford.edu/) header contains a concise 130 description of a SCOP domain. A very similar format is used when a 131 Domain object is converted into a string. The Domain returned by this 132 method contains most of the SCOP information, but it will not be located 133 within the SCOP hierarchy (i.e. The parent node will be None). The 134 description is composed of the SCOP protein and species descriptions. 135 136 A typical ASTRAL header looks like -- 137 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 138 """ 139 m = _domain_re.match(str) 140 if (not m): 141 raise ValueError("Domain: " + str) 142 143 dom = Domain() 144 dom.sid = m.group(1) 145 dom.sccs = m.group(2) 146 dom.residues = Residues.Residues(m.group(3)) 147 if not dom.residues.pdbid: 148 dom.residues.pdbid = dom.sid[1:5] 149 dom.description = m.group(4).strip() 150 151 return dom
152 153
154 -def _open_scop_file(scop_dir_path, version, filetype):
155 filename = "dir.%s.scop.txt_%s" % (filetype, version) 156 handle = open(os.path.join(scop_dir_path, filename)) 157 return handle
158 159
160 -class Scop(object):
161 """The entire SCOP hierarchy. 162 163 root -- The root node of the hierarchy 164 """
165 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 166 dir_path=None, db_handle=None, version=None):
167 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 168 169 If no file handles are given, then a Scop object with a single 170 empty root node is returned. 171 172 If a directory and version are given (with dir_path=.., version=...) or 173 file handles for each file, the whole scop tree will be built in memory. 174 175 If a MySQLdb database handle is given, the tree will be built as needed, 176 minimising construction times. To build the SQL database to the methods 177 write_xxx_sql to create the tables. 178 179 """ 180 self._sidDict = {} 181 self._sunidDict = {} 182 183 if all(h is None for h in [cla_handle, des_handle, hie_handle, dir_path, db_handle]): 184 return 185 186 if dir_path is None and db_handle is None: 187 if cla_handle is None or des_handle is None or hie_handle is None: 188 raise RuntimeError("Need CLA, DES and HIE files to build SCOP") 189 190 sunidDict = {} 191 192 self.db_handle = db_handle 193 try: 194 if db_handle: 195 # do nothing if we have a db handle, we'll do it all on the fly 196 pass 197 else: 198 # open SCOP parseable files 199 if dir_path: 200 if not version: 201 raise RuntimeError("Need SCOP version to find parsable files in directory") 202 if cla_handle or des_handle or hie_handle: 203 raise RuntimeError("Cannot specify SCOP directory and specific files") 204 205 cla_handle = _open_scop_file(dir_path, version, 'cla') 206 des_handle = _open_scop_file(dir_path, version, 'des') 207 hie_handle = _open_scop_file(dir_path, version, 'hie') 208 209 root = Node() 210 domains = [] 211 root.sunid = 0 212 root.type = 'ro' 213 sunidDict[root.sunid] = root 214 self.root = root 215 root.description = 'SCOP Root' 216 217 # Build the rest of the nodes using the DES file 218 records = Des.parse(des_handle) 219 for record in records: 220 if record.nodetype == 'px': 221 n = Domain() 222 n.sid = record.name 223 domains.append(n) 224 else: 225 n = Node() 226 n.sunid = record.sunid 227 n.type = record.nodetype 228 n.sccs = record.sccs 229 n.description = record.description 230 231 sunidDict[n.sunid] = n 232 233 # Glue all of the Nodes together using the HIE file 234 records = Hie.parse(hie_handle) 235 for record in records: 236 if record.sunid not in sunidDict: 237 print(record.sunid) 238 239 n = sunidDict[record.sunid] 240 241 if record.parent != '': # Not root node 242 243 if record.parent not in sunidDict: 244 raise ValueError("Incomplete data?") 245 246 n.parent = sunidDict[record.parent] 247 248 for c in record.children: 249 if c not in sunidDict: 250 raise ValueError("Incomplete data?") 251 n.children.append(sunidDict[c]) 252 253 # Fill in the gaps with information from the CLA file 254 sidDict = {} 255 records = Cla.parse(cla_handle) 256 for record in records: 257 n = sunidDict[record.sunid] 258 assert n.sccs == record.sccs 259 assert n.sid == record.sid 260 n.residues = record.residues 261 sidDict[n.sid] = n 262 263 # Clean up 264 self._sunidDict = sunidDict 265 self._sidDict = sidDict 266 self._domains = tuple(domains) 267 268 finally: 269 if dir_path: 270 # If we opened the files, we close the files 271 if cla_handle: 272 cla_handle.close() 273 if des_handle: 274 des_handle.close() 275 if hie_handle: 276 hie_handle.close()
277
278 - def getRoot(self):
279 return self.getNodeBySunid(0)
280
281 - def getDomainBySid(self, sid):
282 """Return a domain from its sid""" 283 if sid in self._sidDict: 284 return self._sidDict[sid] 285 if self.db_handle: 286 self.getDomainFromSQL(sid=sid) 287 if sid in self._sidDict: 288 return self._sidDict[sid] 289 else: 290 return None
291
292 - def getNodeBySunid(self, sunid):
293 """Return a node from its sunid""" 294 if sunid in self._sunidDict: 295 return self._sunidDict[sunid] 296 if self.db_handle: 297 self.getDomainFromSQL(sunid=sunid) 298 if sunid in self._sunidDict: 299 return self._sunidDict[sunid] 300 else: 301 return None
302
303 - def getDomains(self):
304 """Returns an ordered tuple of all SCOP Domains""" 305 if self.db_handle: 306 return self.getRoot().getDescendents('px') 307 else: 308 return self._domains
309
310 - def write_hie(self, handle):
311 """Build an HIE SCOP parsable file from this object""" 312 # We order nodes to ease comparison with original file 313 for n in sorted(self._sunidDict.values(), key=lambda n: n.sunid): 314 handle.write(str(n.toHieRecord()))
315
316 - def write_des(self, handle):
317 """Build a DES SCOP parsable file from this object""" 318 # Original SCOP file is not ordered? 319 for n in sorted(self._sunidDict.values(), key=lambda n: n.sunid): 320 if n != self.root: 321 handle.write(str(n.toDesRecord()))
322
323 - def write_cla(self, handle):
324 """Build a CLA SCOP parsable file from this object""" 325 # We order nodes to ease comparison with original file 326 for n in sorted(self._sidDict.values(), key=lambda n: n.sunid): 327 handle.write(str(n.toClaRecord()))
328
329 - def getDomainFromSQL(self, sunid=None, sid=None):
330 """Load a node from the SQL backend using sunid or sid""" 331 if sunid is None and sid is None: 332 return None 333 334 cur = self.db_handle.cursor() 335 336 if sid: 337 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 338 res = cur.fetchone() 339 if res is None: 340 return None 341 sunid = res[0] 342 343 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 344 data = cur.fetchone() 345 346 if data is not None: 347 n = None 348 349 # determine if Node or Domain 350 if data[1] != "px": 351 n = Node(scop=self) 352 353 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 354 children = [] 355 for c in cur.fetchall(): 356 children.append(c[0]) 357 n.children = children 358 else: 359 n = Domain(scop=self) 360 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 361 sunid) 362 363 [n.sid, n.residues, pdbid] = cur.fetchone() 364 n.residues = Residues.Residues(n.residues) 365 n.residues.pdbid = pdbid 366 self._sidDict[n.sid] = n 367 368 [n.sunid, n.type, n.sccs, n.description] = data 369 370 if data[1] != 'ro': 371 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 372 n.parent = cur.fetchone()[0] 373 374 n.sunid = int(n.sunid) 375 376 self._sunidDict[n.sunid] = n
377
378 - def getAscendentFromSQL(self, node, type):
379 """Get ascendents using SQL backend""" 380 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): 381 return None 382 383 cur = self.db_handle.cursor() 384 cur.execute("SELECT " + type + " from cla WHERE " + node.type + "=%s", (node.sunid)) 385 result = cur.fetchone() 386 if result is not None: 387 return self.getNodeBySunid(result[0]) 388 else: 389 return None
390
391 - def getDescendentsFromSQL(self, node, type):
392 """Get descendents of a node using the database backend. This avoids 393 repeated iteration of SQL calls and is therefore much quicker than 394 repeatedly calling node.getChildren(). 395 """ 396 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): 397 return [] 398 399 des_list = [] 400 401 # SQL cla table knows nothing about 'ro' 402 if node.type == 'ro': 403 for c in node.getChildren(): 404 for d in self.getDescendentsFromSQL(c, type): 405 des_list.append(d) 406 return des_list 407 408 cur = self.db_handle.cursor() 409 410 if type != 'px': 411 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 412 cla,des WHERE cla." + node.type + "=%s AND cla." + type + "=des.sunid", (node.sunid)) 413 data = cur.fetchall() 414 for d in data: 415 if int(d[0]) not in self._sunidDict: 416 n = Node(scop=self) 417 [n.sunid, n.type, n.sccs, n.description] = d 418 n.sunid = int(n.sunid) 419 self._sunidDict[n.sunid] = n 420 421 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 422 n.parent = cur.fetchone()[0] 423 424 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 425 children = [] 426 for c in cur.fetchall(): 427 children.append(c[0]) 428 n.children = children 429 430 des_list.append(self._sunidDict[int(d[0])]) 431 432 else: 433 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 434 FROM cla,des where cla.sunid=des.sunid and cla." + node.type + "=%s", 435 node.sunid) 436 437 data = cur.fetchall() 438 for d in data: 439 if int(d[0]) not in self._sunidDict: 440 n = Domain(scop=self) 441 # [n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type, 442 # n.description,n.parent] = data 443 [n.sunid, n.sid, pdbid, n.residues, n.sccs, n.type, n.description, 444 n.parent] = d[0:8] 445 n.residues = Residues.Residues(n.residues) 446 n.residues.pdbid = pdbid 447 n.sunid = int(n.sunid) 448 self._sunidDict[n.sunid] = n 449 self._sidDict[n.sid] = n 450 451 des_list.append(self._sunidDict[int(d[0])]) 452 453 return des_list
454
455 - def write_hie_sql(self, handle):
456 """Write HIE data to SQL database""" 457 cur = handle.cursor() 458 459 cur.execute("DROP TABLE IF EXISTS hie") 460 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 461 INDEX (parent) )") 462 463 for p in self._sunidDict.values(): 464 for c in p.children: 465 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
466
467 - def write_cla_sql(self, handle):
468 """Write CLA data to SQL database""" 469 cur = handle.cursor() 470 471 cur.execute("DROP TABLE IF EXISTS cla") 472 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 473 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 474 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 475 476 for n in self._sidDict.values(): 477 c = n.toClaRecord() 478 cur.execute("INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 479 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 480 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 481 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 482 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 483 n.sunid))
484
485 - def write_des_sql(self, handle):
486 """Write DES data to SQL database""" 487 cur = handle.cursor() 488 489 cur.execute("DROP TABLE IF EXISTS des") 490 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 491 description VARCHAR(255),\ 492 PRIMARY KEY (sunid) )") 493 494 for n in self._sunidDict.values(): 495 cur.execute("INSERT INTO des VALUES (%s,%s,%s,%s)", 496 (n.sunid, n.type, n.sccs, n.description))
497 498
499 -class Node(object):
500 """ A node in the Scop hierarchy 501 502 sunid -- SCOP unique identifiers. e.g. '14986' 503 504 parent -- The parent node 505 506 children -- A list of child nodes 507 508 sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 509 510 type -- A 2 letter node type code. e.g. 'px' for domains 511 512 description -- 513 514 """
515 - def __init__(self, scop=None):
516 """Create a Node in the scop hierarchy. If a Scop instance is provided to the 517 constructor, this will be used to lookup related references using the SQL 518 methods. If no instance is provided, it is assumed the whole tree exists 519 and is connected.""" 520 self.sunid = '' 521 self.parent = None 522 self.children = [] 523 self.sccs = '' 524 self.type = '' 525 self.description = '' 526 self.scop = scop
527
528 - def __str__(self):
529 s = [] 530 s.append(str(self.sunid)) 531 s.append(self.sccs) 532 s.append(self.type) 533 s.append(self.description) 534 535 return " ".join(s)
536
537 - def toHieRecord(self):
538 """Return an Hie.Record""" 539 rec = Hie.Record() 540 rec.sunid = str(self.sunid) 541 if self.getParent(): # Not root node 542 rec.parent = str(self.getParent().sunid) 543 else: 544 rec.parent = '-' 545 for c in self.getChildren(): 546 rec.children.append(str(c.sunid)) 547 return rec
548
549 - def toDesRecord(self):
550 """Return a Des.Record""" 551 rec = Des.Record() 552 rec.sunid = str(self.sunid) 553 rec.nodetype = self.type 554 rec.sccs = self.sccs 555 rec.description = self.description 556 return rec
557
558 - def getChildren(self):
559 """Return a list of children of this Node""" 560 if self.scop is None: 561 return self.children 562 else: 563 return [self.scop.getNodeBySunid(x) for x in self.children]
564
565 - def getParent(self):
566 """Return the parent of this Node""" 567 if self.scop is None: 568 return self.parent 569 else: 570 return self.scop.getNodeBySunid(self.parent)
571
572 - def getDescendents(self, node_type):
573 """ Return a list of all descendant nodes of the given type. Node type can a 574 two letter code or longer description. e.g. 'fa' or 'family' 575 """ 576 if node_type in _nodetype_to_code: 577 node_type = _nodetype_to_code[node_type] 578 579 nodes = [self] 580 if self.scop: 581 return self.scop.getDescendentsFromSQL(self, node_type) 582 while nodes[0].type != node_type: 583 if nodes[0].type == 'px': 584 return [] # Fell of the bottom of the hierarchy 585 child_list = [] 586 for n in nodes: 587 for child in n.getChildren(): 588 child_list.append(child) 589 nodes = child_list 590 591 return nodes
592
593 - def getAscendent(self, node_type):
594 """ Return the ancenstor node of the given type, or None.Node type can a 595 two letter code or longer description. e.g. 'fa' or 'family'""" 596 if node_type in _nodetype_to_code: 597 node_type = _nodetype_to_code[node_type] 598 599 if self.scop: 600 return self.scop.getAscendentFromSQL(self, node_type) 601 else: 602 n = self 603 if n.type == node_type: 604 return None 605 606 while n.type != node_type: 607 if n.type == 'ro': 608 return None # Fell of the top of the hierarchy 609 n = n.getParent() 610 611 return n
612 613
614 -class Domain(Node):
615 """ A SCOP domain. A leaf node in the Scop hierarchy. 616 617 - sid -- The SCOP domain identifier. e.g. ``"d5hbib_"`` 618 619 - residues -- A Residue object. It defines the collection 620 of PDB atoms that make up this domain. 621 """
622 - def __init__(self, scop=None):
623 Node.__init__(self, scop=scop) 624 self.sid = '' 625 self.residues = None
626
627 - def __str__(self):
628 s = [] 629 s.append(self.sid) 630 s.append(self.sccs) 631 s.append("(" + str(self.residues) + ")") 632 633 if not self.getParent(): 634 s.append(self.description) 635 else: 636 sp = self.getParent() 637 dm = sp.getParent() 638 s.append(dm.description) 639 s.append("{" + sp.description + "}") 640 641 return " ".join(s)
642
643 - def toDesRecord(self):
644 """Return a Des.Record""" 645 rec = Node.toDesRecord(self) 646 rec.name = self.sid 647 return rec
648
649 - def toClaRecord(self):
650 """Return a Cla.Record""" 651 rec = Cla.Record() 652 rec.sid = self.sid 653 rec.residues = self.residues 654 rec.sccs = self.sccs 655 rec.sunid = self.sunid 656 657 n = self 658 while n.sunid != 0: # Not root node 659 rec.hierarchy[n.type] = str(n.sunid) 660 n = n.getParent() 661 662 # Order does not matter in the hierarchy field. For more info, see 663 # http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html 664 # rec.hierarchy.reverse() 665 666 return rec
667 668
669 -class Astral(object):
670 """Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 671 as well as clusterings by percent id or evalue. 672 """ 673
674 - def __init__(self, dir_path=None, version=None, scop=None, 675 astral_file=None, db_handle=None):
676 """Initialise the astral database. 677 678 You must provide either a directory of SCOP files: 679 680 - dir_path - string, the path to location of the scopseq-x.xx directory 681 (not the directory itself), and 682 - version -a version number. 683 684 or, a FASTA file: 685 686 - astral_file - string, a path to a fasta file (which will be loaded in memory) 687 688 or, a MYSQL database: 689 690 - db_handle - a database handle for a MYSQL database containing a table 691 'astral' with the astral data in it. This can be created 692 using writeToSQL. 693 """ 694 if astral_file is None and dir_path is None and db_handle is None: 695 raise RuntimeError("Need either file handle, or (dir_path + version)," 696 " or database handle to construct Astral") 697 if not scop: 698 raise RuntimeError("Must provide a Scop instance to construct") 699 700 self.scop = scop 701 self.db_handle = db_handle 702 703 if not astral_file and not db_handle: 704 if dir_path is None or version is None: 705 raise RuntimeError("must provide dir_path and version") 706 707 self.version = version 708 self.path = os.path.join(dir_path, "scopseq-%s" % version) 709 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 710 astral_file = os.path.join(self.path, astral_file) 711 712 if astral_file: 713 # Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 714 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(astral_file, "fasta")) 715 716 self.astral_file = astral_file 717 self.EvDatasets = {} 718 self.EvDatahash = {} 719 self.IdDatasets = {} 720 self.IdDatahash = {}
721
722 - def domainsClusteredByEv(self, id):
723 """get domains clustered by evalue""" 724 if id not in self.EvDatasets: 725 if self.db_handle: 726 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 727 728 else: 729 if not self.path: 730 raise RuntimeError("No scopseq directory specified") 731 732 file_prefix = "astral-scopdom-seqres-sel-gs" 733 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id], 734 self.version) 735 filename = os.path.join(self.path, filename) 736 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 737 return self.EvDatasets[id]
738
739 - def domainsClusteredById(self, id):
740 """get domains clustered by percent id""" 741 if id not in self.IdDatasets: 742 if self.db_handle: 743 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id" + str(id)) 744 else: 745 if not self.path: 746 raise RuntimeError("No scopseq directory specified") 747 748 file_prefix = "astral-scopdom-seqres-sel-gs" 749 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 750 filename = os.path.join(self.path, filename) 751 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 752 return self.IdDatasets[id]
753
754 - def getAstralDomainsFromFile(self, filename=None, file_handle=None):
755 """Get the scop domains from a file containing a list of sids""" 756 if file_handle is None and filename is None: 757 raise RuntimeError("You must provide a filename or handle") 758 if not file_handle: 759 file_handle = open(filename) 760 doms = [] 761 while True: 762 line = file_handle.readline() 763 if not line: 764 break 765 line = line.rstrip() 766 doms.append(line) 767 if filename: 768 file_handle.close() 769 770 doms = [a for a in doms if a[0] == 'd'] 771 doms = [self.scop.getDomainBySid(x) for x in doms] 772 return doms
773
774 - def getAstralDomainsFromSQL(self, column):
775 """Load a set of astral domains from a column in the astral table of a MYSQL 776 database (which can be created with writeToSQL(...)""" 777 cur = self.db_handle.cursor() 778 cur.execute("SELECT sid FROM astral WHERE " + column + "=1") 779 data = cur.fetchall() 780 data = [self.scop.getDomainBySid(x[0]) for x in data] 781 782 return data
783
784 - def getSeqBySid(self, domain):
785 """get the seq record of a given domain from its sid""" 786 if self.db_handle is None: 787 return self.fasta_dict[domain].seq 788 else: 789 cur = self.db_handle.cursor() 790 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 791 return Seq(cur.fetchone()[0])
792
793 - def getSeq(self, domain):
794 """Return seq associated with domain""" 795 return self.getSeqBySid(domain.sid)
796
797 - def hashedDomainsById(self, id):
798 """Get domains clustered by sequence identity in a dict""" 799 if id not in self.IdDatahash: 800 self.IdDatahash[id] = {} 801 for d in self.domainsClusteredById(id): 802 self.IdDatahash[id][d] = 1 803 return self.IdDatahash[id]
804
805 - def hashedDomainsByEv(self, id):
806 """Get domains clustered by evalue in a dict""" 807 if id not in self.EvDatahash: 808 self.EvDatahash[id] = {} 809 for d in self.domainsClusteredByEv(id): 810 self.EvDatahash[id][d] = 1 811 return self.EvDatahash[id]
812
813 - def isDomainInId(self, dom, id):
814 """Returns true if the domain is in the astral clusters for percent ID""" 815 return dom in self.hashedDomainsById(id)
816
817 - def isDomainInEv(self, dom, id):
818 """Returns true if the domain is in the ASTRAL clusters for evalues""" 819 return dom in self.hashedDomainsByEv(id)
820
821 - def writeToSQL(self, db_handle):
822 """Write the ASTRAL database to a MYSQL database""" 823 cur = db_handle.cursor() 824 825 cur.execute("DROP TABLE IF EXISTS astral") 826 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 827 828 for dom in self.fasta_dict: 829 cur.execute("INSERT INTO astral (sid,seq) values (%s,%s)", 830 (dom, self.fasta_dict[dom].seq.data)) 831 832 for i in astralBibIds: 833 cur.execute("ALTER TABLE astral ADD (id" + str(i) + " TINYINT)") 834 835 for d in self.domainsClusteredById(i): 836 cur.execute("UPDATE astral SET id" + str(i) + "=1 WHERE sid=%s", 837 d.sid) 838 839 for ev in astralEvs: 840 cur.execute("ALTER TABLE astral ADD (" + astralEv_to_sql[ev] + " TINYINT)") 841 842 for d in self.domainsClusteredByEv(ev): 843 cur.execute("UPDATE astral SET " + astralEv_to_sql[ev] + "=1 WHERE sid=%s", 844 d.sid)
845 846
847 -def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 848 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
849 """search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 850 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds) 851 852 Access search.cgi and return a handle to the results. See the 853 online help file for an explanation of the parameters: 854 http://scop.mrc-lmb.cam.ac.uk/scop/help.html 855 856 Raises an IOError if there's a network error. 857 858 """ 859 params = {'pdb': pdb, 'key': key, 'sid': sid, 'disp': disp, 860 'dir': dir, 'loc': loc} 861 variables = {} 862 for k, v in params.items(): 863 if v is not None: 864 variables[k] = v 865 variables.update(keywds) 866 return _open(cgi, variables)
867 868
869 -def _open(cgi, params=None, get=1):
870 """Open a hnadle to SCOP, returns an UndoHandle 871 872 Open a handle to SCOP. cgi is the URL for the cgi script to access. 873 params is a dictionary with the options to pass to it. get is a boolean 874 that describes whether a GET should be used. Does some 875 simple error checking, and will raise an IOError if it encounters one. 876 877 """ 878 from Bio._py3k import urlopen, urlencode 879 880 # Open a handle to SCOP. 881 if params is None: 882 params = {} 883 options = urlencode(params) 884 if get: # do a GET 885 if options: 886 cgi += "?" + options 887 handle = urlopen(cgi) 888 else: # do a POST 889 handle = urlopen(cgi, data=options) 890 return handle
891