Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # Modifications Copyright 2010 Jeffrey Finkelstein. All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  # 
  9  # Changes made by James Casbon: 
 10  # - New Astral class 
 11  # - SQL functionality for both Scop and Astral classes 
 12  # - All sunids are int not strings 
 13  # 
 14  # Code written by Jeffrey Chang to access SCOP over the internet, which 
 15  # was previously in Bio.WWW.SCOP, has now been merged into this module. 
 16   
 17   
 18  """SCOP: Structural Classification of Proteins. 
 19   
 20  The SCOP database aims to provide a manually constructed classification of 
 21  all know protein structures into a hierarchy, the main levels of which 
 22  are family, superfamily and fold. 
 23   
 24  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 25  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 26  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 27   
 28  The Scop object in this module represents the entire SCOP classification. It 
 29  can be built from the three SCOP parsable files, modified is so desired, and 
 30  converted back to the same file formats. A single SCOP domain (represented 
 31  by the Domain class) can be obtained from Scop using the domain's SCOP 
 32  identifier (sid). 
 33   
 34  - nodeCodeDict -- A mapping between known 2 letter node codes and a longer 
 35                    description. The known node types are 'cl' (class), 'cf' 
 36                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain), 
 37                    'sp' (species), 'px' (domain). Additional node types may 
 38                    be added in the future. 
 39   
 40  This module also provides code to access SCOP over the WWW. 
 41   
 42  Functions: 
 43   - search        -- Access the main CGI script. 
 44   - _open         -- Internally used function. 
 45   
 46  """ 
 47   
 48  from __future__ import print_function 
 49   
 50  import os 
 51  import re 
 52   
 53  from . import Des 
 54  from . import Cla 
 55  from . import Hie 
 56  from . import Residues 
 57  from Bio import SeqIO 
 58  from Bio.Seq import Seq 
 59   
 60   
 61  nodeCodeDict = {'cl': 'class', 'cf': 'fold', 'sf': 'superfamily', 
 62                  'fa': 'family', 'dm': 'protein', 'sp': 'species', 'px': 'domain'} 
 63   
 64   
 65  _nodetype_to_code = {'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 66                       'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 67   
 68  nodeCodeOrder = ['ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px'] 
 69   
 70  astralBibIds = [10, 20, 25, 30, 35, 40, 50, 70, 90, 95, 100] 
 71   
 72  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 73               1e-20, 1e-25, 1e-50] 
 74   
 75  astralEv_to_file = {10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 76                      0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 77                      1e-4: 'e-4', 1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 78                      1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50'} 
 79   
 80  astralEv_to_sql = {10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 81                     0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 82                     1e-4: 'e_4', 1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 83                     1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50'} 
 84   
 85  try: 
 86      # See if the cmp function exists (will on Python 2) 
 87      _cmp = cmp 
 88  except NameError: 
89 - def _cmp(a, b):
90 """Implement cmp(x,y) for Python 3 (PRIVATE). 91 92 Based on Python 3 docs which say if you really need the cmp() 93 functionality, you could use the expression (a > b) - (a < b) 94 as the equivalent for cmp(a, b) 95 """ 96 return (a > b) - (a < b)
97 98
99 -def cmp_sccs(sccs1, sccs2):
100 """Order SCOP concise classification strings (sccs). 101 102 a.4.5.1 < a.4.5.11 < b.1.1.1 103 104 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 105 The letter represents the class, and the numbers are the fold, 106 superfamily, and family, respectively. 107 """ 108 s1 = sccs1.split(".") 109 s2 = sccs2.split(".") 110 111 if s1[0] != s2[0]: 112 return _cmp(s1[0], s2[0]) 113 114 s1 = [int(x) for x in s1[1:]] 115 s2 = [int(x) for x in s2[1:]] 116 117 return _cmp(s1, s2)
118 119 120 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 121 122
123 -def parse_domain(str):
124 """Convert an ASTRAL header string into a Scop domain. 125 126 An ASTRAL (http://astral.stanford.edu/) header contains a concise 127 description of a SCOP domain. A very similar format is used when a 128 Domain object is converted into a string. The Domain returned by this 129 method contains most of the SCOP information, but it will not be located 130 within the SCOP hierarchy (i.e. The parent node will be None). The 131 description is composed of the SCOP protein and species descriptions. 132 133 A typical ASTRAL header looks like -- 134 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 135 """ 136 m = _domain_re.match(str) 137 if (not m): 138 raise ValueError("Domain: " + str) 139 140 dom = Domain() 141 dom.sid = m.group(1) 142 dom.sccs = m.group(2) 143 dom.residues = Residues.Residues(m.group(3)) 144 if not dom.residues.pdbid: 145 dom.residues.pdbid = dom.sid[1:5] 146 dom.description = m.group(4).strip() 147 148 return dom
149 150
151 -def _open_scop_file(scop_dir_path, version, filetype):
152 filename = "dir.%s.scop.txt_%s" % (filetype, version) 153 handle = open(os.path.join(scop_dir_path, filename)) 154 return handle
155 156
157 -class Scop(object):
158 """The entire SCOP hierarchy. 159 160 root -- The root node of the hierarchy 161 """ 162
163 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 164 dir_path=None, db_handle=None, version=None):
165 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 166 167 If no file handles are given, then a Scop object with a single 168 empty root node is returned. 169 170 If a directory and version are given (with dir_path=.., version=...) or 171 file handles for each file, the whole scop tree will be built in memory. 172 173 If a MySQLdb database handle is given, the tree will be built as needed, 174 minimising construction times. To build the SQL database to the methods 175 write_xxx_sql to create the tables. 176 177 """ 178 self._sidDict = {} 179 self._sunidDict = {} 180 181 if all(h is None for h in [cla_handle, des_handle, hie_handle, dir_path, db_handle]): 182 return 183 184 if dir_path is None and db_handle is None: 185 if cla_handle is None or des_handle is None or hie_handle is None: 186 raise RuntimeError("Need CLA, DES and HIE files to build SCOP") 187 188 sunidDict = {} 189 190 self.db_handle = db_handle 191 try: 192 if db_handle: 193 # do nothing if we have a db handle, we'll do it all on the fly 194 pass 195 else: 196 # open SCOP parseable files 197 if dir_path: 198 if not version: 199 raise RuntimeError("Need SCOP version to find parsable files in directory") 200 if cla_handle or des_handle or hie_handle: 201 raise RuntimeError("Cannot specify SCOP directory and specific files") 202 203 cla_handle = _open_scop_file(dir_path, version, 'cla') 204 des_handle = _open_scop_file(dir_path, version, 'des') 205 hie_handle = _open_scop_file(dir_path, version, 'hie') 206 207 root = Node() 208 domains = [] 209 root.sunid = 0 210 root.type = 'ro' 211 sunidDict[root.sunid] = root 212 self.root = root 213 root.description = 'SCOP Root' 214 215 # Build the rest of the nodes using the DES file 216 records = Des.parse(des_handle) 217 for record in records: 218 if record.nodetype == 'px': 219 n = Domain() 220 n.sid = record.name 221 domains.append(n) 222 else: 223 n = Node() 224 n.sunid = record.sunid 225 n.type = record.nodetype 226 n.sccs = record.sccs 227 n.description = record.description 228 229 sunidDict[n.sunid] = n 230 231 # Glue all of the Nodes together using the HIE file 232 records = Hie.parse(hie_handle) 233 for record in records: 234 if record.sunid not in sunidDict: 235 print(record.sunid) 236 237 n = sunidDict[record.sunid] 238 239 if record.parent != '': # Not root node 240 241 if record.parent not in sunidDict: 242 raise ValueError("Incomplete data?") 243 244 n.parent = sunidDict[record.parent] 245 246 for c in record.children: 247 if c not in sunidDict: 248 raise ValueError("Incomplete data?") 249 n.children.append(sunidDict[c]) 250 251 # Fill in the gaps with information from the CLA file 252 sidDict = {} 253 records = Cla.parse(cla_handle) 254 for record in records: 255 n = sunidDict[record.sunid] 256 assert n.sccs == record.sccs 257 assert n.sid == record.sid 258 n.residues = record.residues 259 sidDict[n.sid] = n 260 261 # Clean up 262 self._sunidDict = sunidDict 263 self._sidDict = sidDict 264 self._domains = tuple(domains) 265 266 finally: 267 if dir_path: 268 # If we opened the files, we close the files 269 if cla_handle: 270 cla_handle.close() 271 if des_handle: 272 des_handle.close() 273 if hie_handle: 274 hie_handle.close()
275
276 - def getRoot(self):
277 """Get root node.""" 278 return self.getNodeBySunid(0)
279
280 - def getDomainBySid(self, sid):
281 """Return a domain from its sid.""" 282 if sid in self._sidDict: 283 return self._sidDict[sid] 284 if self.db_handle: 285 self.getDomainFromSQL(sid=sid) 286 if sid in self._sidDict: 287 return self._sidDict[sid] 288 else: 289 return None
290
291 - def getNodeBySunid(self, sunid):
292 """Return a node from its sunid.""" 293 if sunid in self._sunidDict: 294 return self._sunidDict[sunid] 295 if self.db_handle: 296 self.getDomainFromSQL(sunid=sunid) 297 if sunid in self._sunidDict: 298 return self._sunidDict[sunid] 299 else: 300 return None
301
302 - def getDomains(self):
303 """Return an ordered tuple of all SCOP Domains.""" 304 if self.db_handle: 305 return self.getRoot().getDescendents('px') 306 else: 307 return self._domains
308
309 - def write_hie(self, handle):
310 """Build an HIE SCOP parsable file from this object.""" 311 # We order nodes to ease comparison with original file 312 for n in sorted(self._sunidDict.values(), key=lambda n: n.sunid): 313 handle.write(str(n.toHieRecord()))
314
315 - def write_des(self, handle):
316 """Build a DES SCOP parsable file from this object.""" 317 # Original SCOP file is not ordered? 318 for n in sorted(self._sunidDict.values(), key=lambda n: n.sunid): 319 if n != self.root: 320 handle.write(str(n.toDesRecord()))
321
322 - def write_cla(self, handle):
323 """Build a CLA SCOP parsable file from this object.""" 324 # We order nodes to ease comparison with original file 325 for n in sorted(self._sidDict.values(), key=lambda n: n.sunid): 326 handle.write(str(n.toClaRecord()))
327
328 - def getDomainFromSQL(self, sunid=None, sid=None):
329 """Load a node from the SQL backend using sunid or sid.""" 330 if sunid is None and sid is None: 331 return None 332 333 cur = self.db_handle.cursor() 334 335 if sid: 336 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 337 res = cur.fetchone() 338 if res is None: 339 return None 340 sunid = res[0] 341 342 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 343 data = cur.fetchone() 344 345 if data is not None: 346 n = None 347 348 # determine if Node or Domain 349 if data[1] != "px": 350 n = Node(scop=self) 351 352 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 353 children = [] 354 for c in cur.fetchall(): 355 children.append(c[0]) 356 n.children = children 357 else: 358 n = Domain(scop=self) 359 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 360 sunid) 361 362 n.sid, n.residues, pdbid = cur.fetchone() 363 n.residues = Residues.Residues(n.residues) 364 n.residues.pdbid = pdbid 365 self._sidDict[n.sid] = n 366 367 n.sunid, n.type, n.sccs, n.description = data 368 369 if data[1] != 'ro': 370 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 371 n.parent = cur.fetchone()[0] 372 373 n.sunid = int(n.sunid) 374 375 self._sunidDict[n.sunid] = n
376
377 - def getAscendentFromSQL(self, node, type):
378 """Get ascendents using SQL backend.""" 379 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): 380 return None 381 382 cur = self.db_handle.cursor() 383 cur.execute("SELECT " + type + " from cla WHERE " + node.type + "=%s", (node.sunid)) 384 result = cur.fetchone() 385 if result is not None: 386 return self.getNodeBySunid(result[0]) 387 else: 388 return None
389
390 - def getDescendentsFromSQL(self, node, type):
391 """Get descendents of a node using the database backend. 392 393 This avoids repeated iteration of SQL calls and is therefore much 394 quicker than repeatedly calling node.getChildren(). 395 """ 396 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): 397 return [] 398 399 des_list = [] 400 401 # SQL cla table knows nothing about 'ro' 402 if node.type == 'ro': 403 for c in node.getChildren(): 404 for d in self.getDescendentsFromSQL(c, type): 405 des_list.append(d) 406 return des_list 407 408 cur = self.db_handle.cursor() 409 410 if type != 'px': 411 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 412 cla,des WHERE cla." + node.type + "=%s AND cla." + type + "=des.sunid", (node.sunid)) 413 data = cur.fetchall() 414 for d in data: 415 if int(d[0]) not in self._sunidDict: 416 n = Node(scop=self) 417 n.sunid, n.type, n.sccs, n.description = d 418 n.sunid = int(n.sunid) 419 self._sunidDict[n.sunid] = n 420 421 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 422 n.parent = cur.fetchone()[0] 423 424 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 425 children = [] 426 for c in cur.fetchall(): 427 children.append(c[0]) 428 n.children = children 429 430 des_list.append(self._sunidDict[int(d[0])]) 431 432 else: 433 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 434 FROM cla,des where cla.sunid=des.sunid and cla." + node.type + "=%s", 435 node.sunid) 436 437 data = cur.fetchall() 438 for d in data: 439 if int(d[0]) not in self._sunidDict: 440 n = Domain(scop=self) 441 n.sunid, n.sid, pdbid, n.residues, n.sccs, n.type, n.description, n.parent = d[0:8] 442 n.residues = Residues.Residues(n.residues) 443 n.residues.pdbid = pdbid 444 n.sunid = int(n.sunid) 445 self._sunidDict[n.sunid] = n 446 self._sidDict[n.sid] = n 447 448 des_list.append(self._sunidDict[int(d[0])]) 449 450 return des_list
451
452 - def write_hie_sql(self, handle):
453 """Write HIE data to SQL database.""" 454 cur = handle.cursor() 455 456 cur.execute("DROP TABLE IF EXISTS hie") 457 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 458 INDEX (parent) )") 459 460 for p in self._sunidDict.values(): 461 for c in p.children: 462 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
463
464 - def write_cla_sql(self, handle):
465 """Write CLA data to SQL database.""" 466 cur = handle.cursor() 467 468 cur.execute("DROP TABLE IF EXISTS cla") 469 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 470 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 471 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 472 473 for n in self._sidDict.values(): 474 c = n.toClaRecord() 475 cur.execute("INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 476 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 477 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 478 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 479 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 480 n.sunid))
481
482 - def write_des_sql(self, handle):
483 """Write DES data to SQL database.""" 484 cur = handle.cursor() 485 486 cur.execute("DROP TABLE IF EXISTS des") 487 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 488 description VARCHAR(255),\ 489 PRIMARY KEY (sunid) )") 490 491 for n in self._sunidDict.values(): 492 cur.execute("INSERT INTO des VALUES (%s,%s,%s,%s)", 493 (n.sunid, n.type, n.sccs, n.description))
494 495
496 -class Node(object):
497 """A node in the Scop hierarchy. 498 499 Attributes: 500 - sunid -- SCOP unique identifiers. e.g. '14986' 501 - parent -- The parent node 502 - children -- A list of child nodes 503 - sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 504 - type -- A 2 letter node type code. e.g. 'px' for domains 505 - description -- Description text. 506 507 """ 508
509 - def __init__(self, scop=None):
510 """Initialize a Node in the scop hierarchy. 511 512 If a Scop instance is provided to the constructor, this will be used 513 to lookup related references using the SQL methods. If no instance 514 is provided, it is assumed the whole tree exists and is connected. 515 """ 516 self.sunid = '' 517 self.parent = None 518 self.children = [] 519 self.sccs = '' 520 self.type = '' 521 self.description = '' 522 self.scop = scop
523
524 - def __str__(self):
525 """Represent the node as a string.""" 526 s = [] 527 s.append(str(self.sunid)) 528 s.append(self.sccs) 529 s.append(self.type) 530 s.append(self.description) 531 532 return " ".join(s)
533
534 - def toHieRecord(self):
535 """Return an Hie.Record.""" 536 rec = Hie.Record() 537 rec.sunid = str(self.sunid) 538 if self.getParent(): # Not root node 539 rec.parent = str(self.getParent().sunid) 540 else: 541 rec.parent = '-' 542 for c in self.getChildren(): 543 rec.children.append(str(c.sunid)) 544 return rec
545
546 - def toDesRecord(self):
547 """Return a Des.Record.""" 548 rec = Des.Record() 549 rec.sunid = str(self.sunid) 550 rec.nodetype = self.type 551 rec.sccs = self.sccs 552 rec.description = self.description 553 return rec
554
555 - def getChildren(self):
556 """Return a list of children of this Node.""" 557 if self.scop is None: 558 return self.children 559 else: 560 return [self.scop.getNodeBySunid(x) for x in self.children]
561
562 - def getParent(self):
563 """Return the parent of this Node.""" 564 if self.scop is None: 565 return self.parent 566 else: 567 return self.scop.getNodeBySunid(self.parent)
568
569 - def getDescendents(self, node_type):
570 """Return a list of all descendant nodes of the given type. 571 572 Node type can be a two letter code or longer description, 573 e.g. 'fa' or 'family'. 574 """ 575 if node_type in _nodetype_to_code: 576 node_type = _nodetype_to_code[node_type] 577 578 nodes = [self] 579 if self.scop: 580 return self.scop.getDescendentsFromSQL(self, node_type) 581 while nodes[0].type != node_type: 582 if nodes[0].type == 'px': 583 return [] # Fell of the bottom of the hierarchy 584 child_list = [] 585 for n in nodes: 586 for child in n.getChildren(): 587 child_list.append(child) 588 nodes = child_list 589 590 return nodes
591
592 - def getAscendent(self, node_type):
593 """Return the ancenstor node of the given type, or None. 594 595 Node type can be a two letter code or longer description, 596 e.g. 'fa' or 'family'. 597 """ 598 if node_type in _nodetype_to_code: 599 node_type = _nodetype_to_code[node_type] 600 601 if self.scop: 602 return self.scop.getAscendentFromSQL(self, node_type) 603 else: 604 n = self 605 if n.type == node_type: 606 return None 607 608 while n.type != node_type: 609 if n.type == 'ro': 610 return None # Fell of the top of the hierarchy 611 n = n.getParent() 612 613 return n
614 615
616 -class Domain(Node):
617 """A SCOP domain. A leaf node in the Scop hierarchy. 618 619 Attributes: 620 - sid - The SCOP domain identifier. e.g. ``"d5hbib_"`` 621 - residues - A Residue object. It defines the collection of PDB 622 atoms that make up this domain. 623 624 """ 625
626 - def __init__(self, scop=None):
627 """Initialize a SCOP Domain object.""" 628 Node.__init__(self, scop=scop) 629 self.sid = '' 630 self.residues = None
631
632 - def __str__(self):
633 """Represent the SCOP Domain as a string.""" 634 s = [] 635 s.append(self.sid) 636 s.append(self.sccs) 637 s.append("(" + str(self.residues) + ")") 638 639 if not self.getParent(): 640 s.append(self.description) 641 else: 642 sp = self.getParent() 643 dm = sp.getParent() 644 s.append(dm.description) 645 s.append("{" + sp.description + "}") 646 647 return " ".join(s)
648
649 - def toDesRecord(self):
650 """Return a Des.Record.""" 651 rec = Node.toDesRecord(self) 652 rec.name = self.sid 653 return rec
654
655 - def toClaRecord(self):
656 """Return a Cla.Record.""" 657 rec = Cla.Record() 658 rec.sid = self.sid 659 rec.residues = self.residues 660 rec.sccs = self.sccs 661 rec.sunid = self.sunid 662 663 n = self 664 while n.sunid != 0: # Not root node 665 rec.hierarchy[n.type] = str(n.sunid) 666 n = n.getParent() 667 668 # Order does not matter in the hierarchy field. For more info, see 669 # http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html 670 # rec.hierarchy.reverse() 671 672 return rec
673 674
675 -class Astral(object):
676 """Representation of the ASTRAL database. 677 678 Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 679 as well as clusterings by percent id or evalue. 680 """ 681
682 - def __init__(self, dir_path=None, version=None, scop=None, 683 astral_file=None, db_handle=None):
684 """Initialize the astral database. 685 686 You must provide either a directory of SCOP files: 687 - dir_path - string, the path to location of the scopseq-x.xx directory 688 (not the directory itself), and 689 - version -a version number. 690 691 or, a FASTA file: 692 - astral_file - string, a path to a fasta file (which will be loaded in memory) 693 694 or, a MYSQL database: 695 - db_handle - a database handle for a MYSQL database containing a table 696 'astral' with the astral data in it. This can be created 697 using writeToSQL. 698 699 """ 700 if astral_file is None and dir_path is None and db_handle is None: 701 raise RuntimeError("Need either file handle, or (dir_path + version)," 702 " or database handle to construct Astral") 703 if not scop: 704 raise RuntimeError("Must provide a Scop instance to construct") 705 706 self.scop = scop 707 self.db_handle = db_handle 708 709 if not astral_file and not db_handle: 710 if dir_path is None or version is None: 711 raise RuntimeError("must provide dir_path and version") 712 713 self.version = version 714 self.path = os.path.join(dir_path, "scopseq-%s" % version) 715 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 716 astral_file = os.path.join(self.path, astral_file) 717 718 if astral_file: 719 # Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 720 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(astral_file, "fasta")) 721 722 self.astral_file = astral_file 723 self.EvDatasets = {} 724 self.EvDatahash = {} 725 self.IdDatasets = {} 726 self.IdDatahash = {}
727
728 - def domainsClusteredByEv(self, id):
729 """Get domains clustered by evalue.""" 730 if id not in self.EvDatasets: 731 if self.db_handle: 732 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 733 734 else: 735 if not self.path: 736 raise RuntimeError("No scopseq directory specified") 737 738 file_prefix = "astral-scopdom-seqres-sel-gs" 739 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id], 740 self.version) 741 filename = os.path.join(self.path, filename) 742 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 743 return self.EvDatasets[id]
744
745 - def domainsClusteredById(self, id):
746 """Get domains clustered by percentage identity.""" 747 if id not in self.IdDatasets: 748 if self.db_handle: 749 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id" + str(id)) 750 else: 751 if not self.path: 752 raise RuntimeError("No scopseq directory specified") 753 754 file_prefix = "astral-scopdom-seqres-sel-gs" 755 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 756 filename = os.path.join(self.path, filename) 757 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 758 return self.IdDatasets[id]
759
760 - def getAstralDomainsFromFile(self, filename=None, file_handle=None):
761 """Get the scop domains from a file containing a list of sids.""" 762 if file_handle is None and filename is None: 763 raise RuntimeError("You must provide a filename or handle") 764 if not file_handle: 765 file_handle = open(filename) 766 doms = [] 767 while True: 768 line = file_handle.readline() 769 if not line: 770 break 771 line = line.rstrip() 772 doms.append(line) 773 if filename: 774 file_handle.close() 775 776 doms = [a for a in doms if a[0] == 'd'] 777 doms = [self.scop.getDomainBySid(x) for x in doms] 778 return doms
779
780 - def getAstralDomainsFromSQL(self, column):
781 """Load ASTRAL domains from the MySQL database. 782 783 Load a set of astral domains from a column in the astral table of a MYSQL 784 database (which can be created with writeToSQL(...). 785 """ 786 cur = self.db_handle.cursor() 787 cur.execute("SELECT sid FROM astral WHERE " + column + "=1") 788 data = cur.fetchall() 789 data = [self.scop.getDomainBySid(x[0]) for x in data] 790 791 return data
792
793 - def getSeqBySid(self, domain):
794 """Get the seq record of a given domain from its sid.""" 795 if self.db_handle is None: 796 return self.fasta_dict[domain].seq 797 else: 798 cur = self.db_handle.cursor() 799 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 800 return Seq(cur.fetchone()[0])
801
802 - def getSeq(self, domain):
803 """Return seq associated with domain.""" 804 return self.getSeqBySid(domain.sid)
805
806 - def hashedDomainsById(self, id):
807 """Get domains clustered by sequence identity in a dict.""" 808 if id not in self.IdDatahash: 809 self.IdDatahash[id] = {} 810 for d in self.domainsClusteredById(id): 811 self.IdDatahash[id][d] = 1 812 return self.IdDatahash[id]
813
814 - def hashedDomainsByEv(self, id):
815 """Get domains clustered by evalue in a dict.""" 816 if id not in self.EvDatahash: 817 self.EvDatahash[id] = {} 818 for d in self.domainsClusteredByEv(id): 819 self.EvDatahash[id][d] = 1 820 return self.EvDatahash[id]
821
822 - def isDomainInId(self, dom, id):
823 """Return true if the domain is in the astral clusters for percent ID.""" 824 return dom in self.hashedDomainsById(id)
825
826 - def isDomainInEv(self, dom, id):
827 """Return true if the domain is in the ASTRAL clusters for evalues.""" 828 return dom in self.hashedDomainsByEv(id)
829
830 - def writeToSQL(self, db_handle):
831 """Write the ASTRAL database to a MYSQL database.""" 832 cur = db_handle.cursor() 833 834 cur.execute("DROP TABLE IF EXISTS astral") 835 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 836 837 for dom in self.fasta_dict: 838 cur.execute("INSERT INTO astral (sid,seq) values (%s,%s)", 839 (dom, self.fasta_dict[dom].seq.data)) 840 841 for i in astralBibIds: 842 cur.execute("ALTER TABLE astral ADD (id" + str(i) + " TINYINT)") 843 844 for d in self.domainsClusteredById(i): 845 cur.execute("UPDATE astral SET id" + str(i) + "=1 WHERE sid=%s", 846 d.sid) 847 848 for ev in astralEvs: 849 cur.execute("ALTER TABLE astral ADD (" + astralEv_to_sql[ev] + " TINYINT)") 850 851 for d in self.domainsClusteredByEv(ev): 852 cur.execute("UPDATE astral SET " + astralEv_to_sql[ev] + "=1 WHERE sid=%s", 853 d.sid)
854 855
856 -def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 857 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
858 """Access SCOP search and return a handle to the results. 859 860 Access search.cgi and return a handle to the results. See the 861 online help file for an explanation of the parameters: 862 http://scop.mrc-lmb.cam.ac.uk/scop/help.html 863 864 Raises an IOError if there's a network error. 865 866 """ 867 params = {'pdb': pdb, 'key': key, 'sid': sid, 'disp': disp, 868 'dir': dir, 'loc': loc} 869 variables = {} 870 for k, v in params.items(): 871 if v is not None: 872 variables[k] = v 873 variables.update(keywds) 874 return _open(cgi, variables)
875 876
877 -def _open(cgi, params=None, get=1):
878 """Open a handle to SCOP, returns an UndoHandle. 879 880 Open a handle to SCOP. cgi is the URL for the cgi script to access. 881 params is a dictionary with the options to pass to it. get is a boolean 882 that describes whether a GET should be used. Does some 883 simple error checking, and will raise an IOError if it encounters one. 884 885 """ 886 from Bio._py3k import urlopen, urlencode 887 888 # Open a handle to SCOP. 889 if params is None: 890 params = {} 891 options = urlencode(params) 892 if get: # do a GET 893 if options: 894 cgi += "?" + options 895 handle = urlopen(cgi) 896 else: # do a POST 897 handle = urlopen(cgi, data=options) 898 return handle
899