Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # Modifications Copyright 2010 Jeffrey Finkelstein. All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  # 
  9  # Changes made by James Casbon: 
 10  # - New Astral class 
 11  # - SQL functionality for both Scop and Astral classes 
 12  # - All sunids are int not strings 
 13  # 
 14  # Code written by Jeffrey Chang to access SCOP over the internet, which 
 15  # was previously in Bio.WWW.SCOP, has now been merged into this module. 
 16   
 17   
 18  """SCOP: Structural Classification of Proteins. 
 19   
 20  The SCOP database aims to provide a manually constructed classification of 
 21  all know protein structures into a hierarchy, the main levels of which 
 22  are family, superfamily and fold. 
 23   
 24  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 25  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 26  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 27   
 28  The Scop object in this module represents the entire SCOP classification. It 
 29  can be built from the three SCOP parsable files, modified is so desired, and 
 30  converted back to the same file formats. A single SCOP domain (represented 
 31  by the Domain class) can be obtained from Scop using the domain's SCOP 
 32  identifier (sid). 
 33   
 34  - nodeCodeDict -- A mapping between known 2 letter node codes and a longer 
 35                    description. The known node types are 'cl' (class), 'cf' 
 36                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain), 
 37                    'sp' (species), 'px' (domain). Additional node types may 
 38                    be added in the future. 
 39   
 40  This module also provides code to access SCOP over the WWW. 
 41   
 42  Functions: 
 43   - search        -- Access the main CGI script. 
 44   - _open         -- Internally used function. 
 45   
 46  """ 
 47   
 48  from __future__ import print_function 
 49   
 50  import os 
 51  import re 
 52   
 53  from . import Des 
 54  from . import Cla 
 55  from . import Hie 
 56  from . import Residues 
 57  from Bio import SeqIO 
 58  from Bio.Seq import Seq 
 59   
 60   
 61  nodeCodeDict = {'cl': 'class', 'cf': 'fold', 'sf': 'superfamily', 
 62                  'fa': 'family', 'dm': 'protein', 'sp': 'species', 'px': 'domain'} 
 63   
 64   
 65  _nodetype_to_code = {'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 66                       'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 67   
 68  nodeCodeOrder = ['ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px'] 
 69   
 70  astralBibIds = [10, 20, 25, 30, 35, 40, 50, 70, 90, 95, 100] 
 71   
 72  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 73               1e-20, 1e-25, 1e-50] 
 74   
 75  astralEv_to_file = {10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 76                      0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 77                      1e-4: 'e-4', 1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 78                      1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50'} 
 79   
 80  astralEv_to_sql = {10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 81                     0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 82                     1e-4: 'e_4', 1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 83                     1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50'} 
 84   
 85  try: 
 86      # See if the cmp function exists (will on Python 2) 
 87      _cmp = cmp 
 88  except NameError: 
89 - def _cmp(a, b):
90 """Implement cmp(x,y) for Python 3 (PRIVATE). 91 92 Based on Python 3 docs which say if you really need the cmp() 93 functionality, you could use the expression (a > b) - (a < b) 94 as the equivalent for cmp(a, b) 95 """ 96 return (a > b) - (a < b)
97 98
99 -def cmp_sccs(sccs1, sccs2):
100 """Order SCOP concise classification strings (sccs). 101 102 a.4.5.1 < a.4.5.11 < b.1.1.1 103 104 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 105 The letter represents the class, and the numbers are the fold, 106 superfamily, and family, respectively. 107 """ 108 s1 = sccs1.split(".") 109 s2 = sccs2.split(".") 110 111 if s1[0] != s2[0]: 112 return _cmp(s1[0], s2[0]) 113 114 s1 = [int(x) for x in s1[1:]] 115 s2 = [int(x) for x in s2[1:]] 116 117 return _cmp(s1, s2)
118 119 120 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 121 122
123 -def parse_domain(str):
124 """Convert an ASTRAL header string into a Scop domain. 125 126 An ASTRAL (http://astral.stanford.edu/) header contains a concise 127 description of a SCOP domain. A very similar format is used when a 128 Domain object is converted into a string. The Domain returned by this 129 method contains most of the SCOP information, but it will not be located 130 within the SCOP hierarchy (i.e. The parent node will be None). The 131 description is composed of the SCOP protein and species descriptions. 132 133 A typical ASTRAL header looks like -- 134 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 135 """ 136 m = _domain_re.match(str) 137 if (not m): 138 raise ValueError("Domain: " + str) 139 140 dom = Domain() 141 dom.sid = m.group(1) 142 dom.sccs = m.group(2) 143 dom.residues = Residues.Residues(m.group(3)) 144 if not dom.residues.pdbid: 145 dom.residues.pdbid = dom.sid[1:5] 146 dom.description = m.group(4).strip() 147 148 return dom
149 150
151 -def _open_scop_file(scop_dir_path, version, filetype):
152 filename = "dir.%s.scop.txt_%s" % (filetype, version) 153 handle = open(os.path.join(scop_dir_path, filename)) 154 return handle
155 156
157 -class Scop(object):
158 """The entire SCOP hierarchy. 159 160 root -- The root node of the hierarchy 161 """ 162
163 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 164 dir_path=None, db_handle=None, version=None):
165 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 166 167 If no file handles are given, then a Scop object with a single 168 empty root node is returned. 169 170 If a directory and version are given (with dir_path=.., version=...) or 171 file handles for each file, the whole scop tree will be built in memory. 172 173 If a MySQLdb database handle is given, the tree will be built as needed, 174 minimising construction times. To build the SQL database to the methods 175 write_xxx_sql to create the tables. 176 177 """ 178 self._sidDict = {} 179 self._sunidDict = {} 180 181 if all(h is None for h in [cla_handle, des_handle, hie_handle, dir_path, db_handle]): 182 return 183 184 if dir_path is None and db_handle is None: 185 if cla_handle is None or des_handle is None or hie_handle is None: 186 raise RuntimeError("Need CLA, DES and HIE files to build SCOP") 187 188 sunidDict = {} 189 190 self.db_handle = db_handle 191 try: 192 if db_handle: 193 # do nothing if we have a db handle, we'll do it all on the fly 194 pass 195 else: 196 # open SCOP parseable files 197 if dir_path: 198 if not version: 199 raise RuntimeError("Need SCOP version to find parsable files in directory") 200 if cla_handle or des_handle or hie_handle: 201 raise RuntimeError("Cannot specify SCOP directory and specific files") 202 203 cla_handle = _open_scop_file(dir_path, version, 'cla') 204 des_handle = _open_scop_file(dir_path, version, 'des') 205 hie_handle = _open_scop_file(dir_path, version, 'hie') 206 207 root = Node() 208 domains = [] 209 root.sunid = 0 210 root.type = 'ro' 211 sunidDict[root.sunid] = root 212 self.root = root 213 root.description = 'SCOP Root' 214 215 # Build the rest of the nodes using the DES file 216 records = Des.parse(des_handle) 217 for record in records: 218 if record.nodetype == 'px': 219 n = Domain() 220 n.sid = record.name 221 domains.append(n) 222 else: 223 n = Node() 224 n.sunid = record.sunid 225 n.type = record.nodetype 226 n.sccs = record.sccs 227 n.description = record.description 228 229 sunidDict[n.sunid] = n 230 231 # Glue all of the Nodes together using the HIE file 232 records = Hie.parse(hie_handle) 233 for record in records: 234 if record.sunid not in sunidDict: 235 print(record.sunid) 236 237 n = sunidDict[record.sunid] 238 239 if record.parent != '': # Not root node 240 241 if record.parent not in sunidDict: 242 raise ValueError("Incomplete data?") 243 244 n.parent = sunidDict[record.parent] 245 246 for c in record.children: 247 if c not in sunidDict: 248 raise ValueError("Incomplete data?") 249 n.children.append(sunidDict[c]) 250 251 # Fill in the gaps with information from the CLA file 252 sidDict = {} 253 records = Cla.parse(cla_handle) 254 for record in records: 255 n = sunidDict[record.sunid] 256 assert n.sccs == record.sccs 257 assert n.sid == record.sid 258 n.residues = record.residues 259 sidDict[n.sid] = n 260 261 # Clean up 262 self._sunidDict = sunidDict 263 self._sidDict = sidDict 264 self._domains = tuple(domains) 265 266 finally: 267 if dir_path: 268 # If we opened the files, we close the files 269 if cla_handle: 270 cla_handle.close() 271 if des_handle: 272 des_handle.close() 273 if hie_handle: 274 hie_handle.close()
275
276 - def getRoot(self):
277 """Get root node.""" 278 return self.getNodeBySunid(0)
279
280 - def getDomainBySid(self, sid):
281 """Return a domain from its sid.""" 282 if sid in self._sidDict: 283 return self._sidDict[sid] 284 if self.db_handle: 285 self.getDomainFromSQL(sid=sid) 286 if sid in self._sidDict: 287 return self._sidDict[sid] 288 else: 289 return None
290
291 - def getNodeBySunid(self, sunid):
292 """Return a node from its sunid.""" 293 if sunid in self._sunidDict: 294 return self._sunidDict[sunid] 295 if self.db_handle: 296 self.getDomainFromSQL(sunid=sunid) 297 if sunid in self._sunidDict: 298 return self._sunidDict[sunid] 299 else: 300 return None
301
302 - def getDomains(self):
303 """Return an ordered tuple of all SCOP Domains.""" 304 if self.db_handle: 305 return self.getRoot().getDescendents('px') 306 else: 307 return self._domains
308
309 - def write_hie(self, handle):
310 """Build an HIE SCOP parsable file from this object.""" 311 # We order nodes to ease comparison with original file 312 for n in sorted(self._sunidDict.values(), key=lambda n: n.sunid): 313 handle.write(str(n.toHieRecord()))
314
315 - def write_des(self, handle):
316 """Build a DES SCOP parsable file from this object.""" 317 # Original SCOP file is not ordered? 318 for n in sorted(self._sunidDict.values(), key=lambda n: n.sunid): 319 if n != self.root: 320 handle.write(str(n.toDesRecord()))
321
322 - def write_cla(self, handle):
323 """Build a CLA SCOP parsable file from this object.""" 324 # We order nodes to ease comparison with original file 325 for n in sorted(self._sidDict.values(), key=lambda n: n.sunid): 326 handle.write(str(n.toClaRecord()))
327
328 - def getDomainFromSQL(self, sunid=None, sid=None):
329 """Load a node from the SQL backend using sunid or sid.""" 330 if sunid is None and sid is None: 331 return None 332 333 cur = self.db_handle.cursor() 334 335 if sid: 336 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 337 res = cur.fetchone() 338 if res is None: 339 return None 340 sunid = res[0] 341 342 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 343 data = cur.fetchone() 344 345 if data is not None: 346 n = None 347 348 # determine if Node or Domain 349 if data[1] != "px": 350 n = Node(scop=self) 351 352 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 353 children = [] 354 for c in cur.fetchall(): 355 children.append(c[0]) 356 n.children = children 357 else: 358 n = Domain(scop=self) 359 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 360 sunid) 361 362 [n.sid, n.residues, pdbid] = cur.fetchone() 363 n.residues = Residues.Residues(n.residues) 364 n.residues.pdbid = pdbid 365 self._sidDict[n.sid] = n 366 367 [n.sunid, n.type, n.sccs, n.description] = data 368 369 if data[1] != 'ro': 370 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 371 n.parent = cur.fetchone()[0] 372 373 n.sunid = int(n.sunid) 374 375 self._sunidDict[n.sunid] = n
376
377 - def getAscendentFromSQL(self, node, type):
378 """Get ascendents using SQL backend.""" 379 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): 380 return None 381 382 cur = self.db_handle.cursor() 383 cur.execute("SELECT " + type + " from cla WHERE " + node.type + "=%s", (node.sunid)) 384 result = cur.fetchone() 385 if result is not None: 386 return self.getNodeBySunid(result[0]) 387 else: 388 return None
389
390 - def getDescendentsFromSQL(self, node, type):
391 """Get descendents of a node using the database backend. 392 393 This avoids repeated iteration of SQL calls and is therefore much 394 quicker than repeatedly calling node.getChildren(). 395 """ 396 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): 397 return [] 398 399 des_list = [] 400 401 # SQL cla table knows nothing about 'ro' 402 if node.type == 'ro': 403 for c in node.getChildren(): 404 for d in self.getDescendentsFromSQL(c, type): 405 des_list.append(d) 406 return des_list 407 408 cur = self.db_handle.cursor() 409 410 if type != 'px': 411 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 412 cla,des WHERE cla." + node.type + "=%s AND cla." + type + "=des.sunid", (node.sunid)) 413 data = cur.fetchall() 414 for d in data: 415 if int(d[0]) not in self._sunidDict: 416 n = Node(scop=self) 417 [n.sunid, n.type, n.sccs, n.description] = d 418 n.sunid = int(n.sunid) 419 self._sunidDict[n.sunid] = n 420 421 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 422 n.parent = cur.fetchone()[0] 423 424 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 425 children = [] 426 for c in cur.fetchall(): 427 children.append(c[0]) 428 n.children = children 429 430 des_list.append(self._sunidDict[int(d[0])]) 431 432 else: 433 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 434 FROM cla,des where cla.sunid=des.sunid and cla." + node.type + "=%s", 435 node.sunid) 436 437 data = cur.fetchall() 438 for d in data: 439 if int(d[0]) not in self._sunidDict: 440 n = Domain(scop=self) 441 # [n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type, 442 # n.description,n.parent] = data 443 [n.sunid, n.sid, pdbid, n.residues, n.sccs, n.type, n.description, 444 n.parent] = d[0:8] 445 n.residues = Residues.Residues(n.residues) 446 n.residues.pdbid = pdbid 447 n.sunid = int(n.sunid) 448 self._sunidDict[n.sunid] = n 449 self._sidDict[n.sid] = n 450 451 des_list.append(self._sunidDict[int(d[0])]) 452 453 return des_list
454
455 - def write_hie_sql(self, handle):
456 """Write HIE data to SQL database.""" 457 cur = handle.cursor() 458 459 cur.execute("DROP TABLE IF EXISTS hie") 460 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 461 INDEX (parent) )") 462 463 for p in self._sunidDict.values(): 464 for c in p.children: 465 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
466
467 - def write_cla_sql(self, handle):
468 """Write CLA data to SQL database.""" 469 cur = handle.cursor() 470 471 cur.execute("DROP TABLE IF EXISTS cla") 472 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 473 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 474 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 475 476 for n in self._sidDict.values(): 477 c = n.toClaRecord() 478 cur.execute("INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 479 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 480 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 481 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 482 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 483 n.sunid))
484
485 - def write_des_sql(self, handle):
486 """Write DES data to SQL database.""" 487 cur = handle.cursor() 488 489 cur.execute("DROP TABLE IF EXISTS des") 490 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 491 description VARCHAR(255),\ 492 PRIMARY KEY (sunid) )") 493 494 for n in self._sunidDict.values(): 495 cur.execute("INSERT INTO des VALUES (%s,%s,%s,%s)", 496 (n.sunid, n.type, n.sccs, n.description))
497 498
499 -class Node(object):
500 """A node in the Scop hierarchy. 501 502 Attributes: 503 - sunid -- SCOP unique identifiers. e.g. '14986' 504 - parent -- The parent node 505 - children -- A list of child nodes 506 - sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 507 - type -- A 2 letter node type code. e.g. 'px' for domains 508 - description -- Description text. 509 510 """ 511
512 - def __init__(self, scop=None):
513 """Initialize a Node in the scop hierarchy. 514 515 If a Scop instance is provided to the constructor, this will be used 516 to lookup related references using the SQL methods. If no instance 517 is provided, it is assumed the whole tree exists and is connected. 518 """ 519 self.sunid = '' 520 self.parent = None 521 self.children = [] 522 self.sccs = '' 523 self.type = '' 524 self.description = '' 525 self.scop = scop
526
527 - def __str__(self):
528 """Represent the node as a string.""" 529 s = [] 530 s.append(str(self.sunid)) 531 s.append(self.sccs) 532 s.append(self.type) 533 s.append(self.description) 534 535 return " ".join(s)
536
537 - def toHieRecord(self):
538 """Return an Hie.Record.""" 539 rec = Hie.Record() 540 rec.sunid = str(self.sunid) 541 if self.getParent(): # Not root node 542 rec.parent = str(self.getParent().sunid) 543 else: 544 rec.parent = '-' 545 for c in self.getChildren(): 546 rec.children.append(str(c.sunid)) 547 return rec
548
549 - def toDesRecord(self):
550 """Return a Des.Record.""" 551 rec = Des.Record() 552 rec.sunid = str(self.sunid) 553 rec.nodetype = self.type 554 rec.sccs = self.sccs 555 rec.description = self.description 556 return rec
557
558 - def getChildren(self):
559 """Return a list of children of this Node.""" 560 if self.scop is None: 561 return self.children 562 else: 563 return [self.scop.getNodeBySunid(x) for x in self.children]
564
565 - def getParent(self):
566 """Return the parent of this Node.""" 567 if self.scop is None: 568 return self.parent 569 else: 570 return self.scop.getNodeBySunid(self.parent)
571
572 - def getDescendents(self, node_type):
573 """Return a list of all descendant nodes of the given type. 574 575 Node type can be a two letter code or longer description, 576 e.g. 'fa' or 'family'. 577 """ 578 if node_type in _nodetype_to_code: 579 node_type = _nodetype_to_code[node_type] 580 581 nodes = [self] 582 if self.scop: 583 return self.scop.getDescendentsFromSQL(self, node_type) 584 while nodes[0].type != node_type: 585 if nodes[0].type == 'px': 586 return [] # Fell of the bottom of the hierarchy 587 child_list = [] 588 for n in nodes: 589 for child in n.getChildren(): 590 child_list.append(child) 591 nodes = child_list 592 593 return nodes
594
595 - def getAscendent(self, node_type):
596 """Return the ancenstor node of the given type, or None. 597 598 Node type can be a two letter code or longer description, 599 e.g. 'fa' or 'family'. 600 """ 601 if node_type in _nodetype_to_code: 602 node_type = _nodetype_to_code[node_type] 603 604 if self.scop: 605 return self.scop.getAscendentFromSQL(self, node_type) 606 else: 607 n = self 608 if n.type == node_type: 609 return None 610 611 while n.type != node_type: 612 if n.type == 'ro': 613 return None # Fell of the top of the hierarchy 614 n = n.getParent() 615 616 return n
617 618
619 -class Domain(Node):
620 """A SCOP domain. A leaf node in the Scop hierarchy. 621 622 Attributes: 623 - sid - The SCOP domain identifier. e.g. ``"d5hbib_"`` 624 - residues - A Residue object. It defines the collection of PDB 625 atoms that make up this domain. 626 627 """ 628
629 - def __init__(self, scop=None):
630 """Initialize a SCOP Domain object.""" 631 Node.__init__(self, scop=scop) 632 self.sid = '' 633 self.residues = None
634
635 - def __str__(self):
636 """Represent the SCOP Domain as a string.""" 637 s = [] 638 s.append(self.sid) 639 s.append(self.sccs) 640 s.append("(" + str(self.residues) + ")") 641 642 if not self.getParent(): 643 s.append(self.description) 644 else: 645 sp = self.getParent() 646 dm = sp.getParent() 647 s.append(dm.description) 648 s.append("{" + sp.description + "}") 649 650 return " ".join(s)
651
652 - def toDesRecord(self):
653 """Return a Des.Record.""" 654 rec = Node.toDesRecord(self) 655 rec.name = self.sid 656 return rec
657
658 - def toClaRecord(self):
659 """Return a Cla.Record.""" 660 rec = Cla.Record() 661 rec.sid = self.sid 662 rec.residues = self.residues 663 rec.sccs = self.sccs 664 rec.sunid = self.sunid 665 666 n = self 667 while n.sunid != 0: # Not root node 668 rec.hierarchy[n.type] = str(n.sunid) 669 n = n.getParent() 670 671 # Order does not matter in the hierarchy field. For more info, see 672 # http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html 673 # rec.hierarchy.reverse() 674 675 return rec
676 677
678 -class Astral(object):
679 """Representation of the ASTRAL database. 680 681 Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 682 as well as clusterings by percent id or evalue. 683 """ 684
685 - def __init__(self, dir_path=None, version=None, scop=None, 686 astral_file=None, db_handle=None):
687 """Initialize the astral database. 688 689 You must provide either a directory of SCOP files: 690 - dir_path - string, the path to location of the scopseq-x.xx directory 691 (not the directory itself), and 692 - version -a version number. 693 694 or, a FASTA file: 695 - astral_file - string, a path to a fasta file (which will be loaded in memory) 696 697 or, a MYSQL database: 698 - db_handle - a database handle for a MYSQL database containing a table 699 'astral' with the astral data in it. This can be created 700 using writeToSQL. 701 702 """ 703 if astral_file is None and dir_path is None and db_handle is None: 704 raise RuntimeError("Need either file handle, or (dir_path + version)," 705 " or database handle to construct Astral") 706 if not scop: 707 raise RuntimeError("Must provide a Scop instance to construct") 708 709 self.scop = scop 710 self.db_handle = db_handle 711 712 if not astral_file and not db_handle: 713 if dir_path is None or version is None: 714 raise RuntimeError("must provide dir_path and version") 715 716 self.version = version 717 self.path = os.path.join(dir_path, "scopseq-%s" % version) 718 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 719 astral_file = os.path.join(self.path, astral_file) 720 721 if astral_file: 722 # Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 723 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(astral_file, "fasta")) 724 725 self.astral_file = astral_file 726 self.EvDatasets = {} 727 self.EvDatahash = {} 728 self.IdDatasets = {} 729 self.IdDatahash = {}
730
731 - def domainsClusteredByEv(self, id):
732 """Get domains clustered by evalue.""" 733 if id not in self.EvDatasets: 734 if self.db_handle: 735 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 736 737 else: 738 if not self.path: 739 raise RuntimeError("No scopseq directory specified") 740 741 file_prefix = "astral-scopdom-seqres-sel-gs" 742 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id], 743 self.version) 744 filename = os.path.join(self.path, filename) 745 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 746 return self.EvDatasets[id]
747
748 - def domainsClusteredById(self, id):
749 """Get domains clustered by percentage identity.""" 750 if id not in self.IdDatasets: 751 if self.db_handle: 752 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id" + str(id)) 753 else: 754 if not self.path: 755 raise RuntimeError("No scopseq directory specified") 756 757 file_prefix = "astral-scopdom-seqres-sel-gs" 758 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 759 filename = os.path.join(self.path, filename) 760 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 761 return self.IdDatasets[id]
762
763 - def getAstralDomainsFromFile(self, filename=None, file_handle=None):
764 """Get the scop domains from a file containing a list of sids.""" 765 if file_handle is None and filename is None: 766 raise RuntimeError("You must provide a filename or handle") 767 if not file_handle: 768 file_handle = open(filename) 769 doms = [] 770 while True: 771 line = file_handle.readline() 772 if not line: 773 break 774 line = line.rstrip() 775 doms.append(line) 776 if filename: 777 file_handle.close() 778 779 doms = [a for a in doms if a[0] == 'd'] 780 doms = [self.scop.getDomainBySid(x) for x in doms] 781 return doms
782
783 - def getAstralDomainsFromSQL(self, column):
784 """Load ASTRAL domains from the MySQL database. 785 786 Load a set of astral domains from a column in the astral table of a MYSQL 787 database (which can be created with writeToSQL(...). 788 """ 789 cur = self.db_handle.cursor() 790 cur.execute("SELECT sid FROM astral WHERE " + column + "=1") 791 data = cur.fetchall() 792 data = [self.scop.getDomainBySid(x[0]) for x in data] 793 794 return data
795
796 - def getSeqBySid(self, domain):
797 """Get the seq record of a given domain from its sid.""" 798 if self.db_handle is None: 799 return self.fasta_dict[domain].seq 800 else: 801 cur = self.db_handle.cursor() 802 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 803 return Seq(cur.fetchone()[0])
804
805 - def getSeq(self, domain):
806 """Return seq associated with domain.""" 807 return self.getSeqBySid(domain.sid)
808
809 - def hashedDomainsById(self, id):
810 """Get domains clustered by sequence identity in a dict.""" 811 if id not in self.IdDatahash: 812 self.IdDatahash[id] = {} 813 for d in self.domainsClusteredById(id): 814 self.IdDatahash[id][d] = 1 815 return self.IdDatahash[id]
816
817 - def hashedDomainsByEv(self, id):
818 """Get domains clustered by evalue in a dict.""" 819 if id not in self.EvDatahash: 820 self.EvDatahash[id] = {} 821 for d in self.domainsClusteredByEv(id): 822 self.EvDatahash[id][d] = 1 823 return self.EvDatahash[id]
824
825 - def isDomainInId(self, dom, id):
826 """Return true if the domain is in the astral clusters for percent ID.""" 827 return dom in self.hashedDomainsById(id)
828
829 - def isDomainInEv(self, dom, id):
830 """Return true if the domain is in the ASTRAL clusters for evalues.""" 831 return dom in self.hashedDomainsByEv(id)
832
833 - def writeToSQL(self, db_handle):
834 """Write the ASTRAL database to a MYSQL database.""" 835 cur = db_handle.cursor() 836 837 cur.execute("DROP TABLE IF EXISTS astral") 838 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 839 840 for dom in self.fasta_dict: 841 cur.execute("INSERT INTO astral (sid,seq) values (%s,%s)", 842 (dom, self.fasta_dict[dom].seq.data)) 843 844 for i in astralBibIds: 845 cur.execute("ALTER TABLE astral ADD (id" + str(i) + " TINYINT)") 846 847 for d in self.domainsClusteredById(i): 848 cur.execute("UPDATE astral SET id" + str(i) + "=1 WHERE sid=%s", 849 d.sid) 850 851 for ev in astralEvs: 852 cur.execute("ALTER TABLE astral ADD (" + astralEv_to_sql[ev] + " TINYINT)") 853 854 for d in self.domainsClusteredByEv(ev): 855 cur.execute("UPDATE astral SET " + astralEv_to_sql[ev] + "=1 WHERE sid=%s", 856 d.sid)
857 858
859 -def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 860 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
861 """Access SCOP search and return a handle to the results. 862 863 Access search.cgi and return a handle to the results. See the 864 online help file for an explanation of the parameters: 865 http://scop.mrc-lmb.cam.ac.uk/scop/help.html 866 867 Raises an IOError if there's a network error. 868 869 """ 870 params = {'pdb': pdb, 'key': key, 'sid': sid, 'disp': disp, 871 'dir': dir, 'loc': loc} 872 variables = {} 873 for k, v in params.items(): 874 if v is not None: 875 variables[k] = v 876 variables.update(keywds) 877 return _open(cgi, variables)
878 879
880 -def _open(cgi, params=None, get=1):
881 """Open a handle to SCOP, returns an UndoHandle. 882 883 Open a handle to SCOP. cgi is the URL for the cgi script to access. 884 params is a dictionary with the options to pass to it. get is a boolean 885 that describes whether a GET should be used. Does some 886 simple error checking, and will raise an IOError if it encounters one. 887 888 """ 889 from Bio._py3k import urlopen, urlencode 890 891 # Open a handle to SCOP. 892 if params is None: 893 params = {} 894 options = urlencode(params) 895 if get: # do a GET 896 if options: 897 cgi += "?" + options 898 handle = urlopen(cgi) 899 else: # do a POST 900 handle = urlopen(cgi, data=options) 901 return handle
902