Package Bio :: Package SCOP
[hide private]
[frames] | no frames]

Source Code for Package Bio.SCOP

  1  # Copyright 2001 by Gavin E. Crooks.  All rights reserved. 
  2  # Modifications Copyright 2004/2005 James Casbon. All rights Reserved. 
  3  # Modifications Copyright 2010 Jeffrey Finkelstein. All rights reserved. 
  4  # 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  # 
  9  # Changes made by James Casbon: 
 10  # - New Astral class 
 11  # - SQL functionality for both Scop and Astral classes 
 12  # - All sunids are int not strings 
 13  # 
 14  # Code written by Jeffrey Chang to access SCOP over the internet, which 
 15  # was previously in Bio.WWW.SCOP, has now been merged into this module. 
 16   
 17   
 18  """ SCOP: Structural Classification of Proteins. 
 19   
 20  The SCOP database aims to provide a manually constructed classification of 
 21  all know protein structures into a hierarchy, the main levels of which 
 22  are family, superfamily and fold. 
 23   
 24  * "SCOP":http://scop.mrc-lmb.cam.ac.uk/scop/ 
 25   
 26  * "Introduction":http://scop.mrc-lmb.cam.ac.uk/scop/intro.html 
 27   
 28  * "SCOP parsable files":http://scop.mrc-lmb.cam.ac.uk/scop/parse/ 
 29   
 30  The Scop object in this module represents the entire SCOP classification. It 
 31  can be built from the three SCOP parsable files, modified is so desired, and 
 32  converted back to the same file formats. A single SCOP domain (represented 
 33  by the Domain class) can be obtained from Scop using the domain's SCOP 
 34  identifier (sid). 
 35   
 36   
 37  nodeCodeDict  -- A mapping between known 2 letter node codes and a longer 
 38                    description. The known node types are 'cl' (class), 'cf' 
 39                    (fold), 'sf' (superfamily), 'fa' (family), 'dm' (domain), 
 40                    'sp' (species), 'px' (domain). Additional node types may 
 41                    be added in the future. 
 42   
 43  This module also provides code to access SCOP over the WWW. 
 44   
 45  Functions: 
 46  search        -- Access the main CGI script. 
 47  _open         -- Internally used function. 
 48   
 49  """ 
 50   
 51  from __future__ import print_function 
 52   
 53  import os 
 54  import re 
 55   
 56  from . import Des 
 57  from . import Cla 
 58  from . import Hie 
 59  from . import Residues 
 60  from Bio import SeqIO 
 61  from Bio.Seq import Seq 
 62   
 63   
 64  nodeCodeDict = { 'cl':'class', 'cf':'fold', 'sf':'superfamily', 
 65                   'fa':'family', 'dm':'protein', 'sp':'species', 'px':'domain'} 
 66   
 67   
 68  _nodetype_to_code= { 'class': 'cl', 'fold': 'cf', 'superfamily': 'sf', 
 69                       'family': 'fa', 'protein': 'dm', 'species': 'sp', 'domain': 'px'} 
 70   
 71  nodeCodeOrder = [ 'ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px' ] 
 72   
 73  astralBibIds = [10, 20, 25, 30, 35, 40, 50, 70, 90, 95, 100] 
 74   
 75  astralEvs = [10, 5, 1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 1e-4, 1e-5, 1e-10, 1e-15, 
 76               1e-20, 1e-25, 1e-50] 
 77   
 78  astralEv_to_file = { 10: 'e+1', 5: 'e+0,7', 1: 'e+0', 0.5: 'e-0,3', 0.1: 'e-1', 
 79                       0.05: 'e-1,3', 0.01: 'e-2', 0.005: 'e-2,3', 0.001: 'e-3', 
 80                       1e-4: 'e-4',  1e-5: 'e-5', 1e-10: 'e-10', 1e-15: 'e-15', 
 81                       1e-20: 'e-20', 1e-25: 'e-25', 1e-50: 'e-50' } 
 82   
 83  astralEv_to_sql = { 10: 'e1', 5: 'e0_7', 1: 'e0', 0.5: 'e_0_3', 0.1: 'e_1', 
 84                       0.05: 'e_1_3', 0.01: 'e_2', 0.005: 'e_2_3', 0.001: 'e_3', 
 85                       1e-4: 'e_4',  1e-5: 'e_5', 1e-10: 'e_10', 1e-15: 'e_15', 
 86                       1e-20: 'e_20', 1e-25: 'e_25', 1e-50: 'e_50' } 
 87   
 88  try: 
 89      #See if the cmp function exists (will on Python 2) 
 90      _cmp = cmp 
 91  except NameError: 
92 - def _cmp(a, b):
93 """Implementation of cmp(x,y) for Python 3 (PRIVATE). 94 95 Based on Python 3 docs which say if you really need the cmp() 96 functionality, you could use the expression (a > b) - (a < b) 97 as the equivalent for cmp(a, b) 98 """ 99 return (a > b) - (a < b)
100 101
102 -def cmp_sccs(sccs1, sccs2):
103 """Order SCOP concise classification strings (sccs). 104 105 a.4.5.1 < a.4.5.11 < b.1.1.1 106 107 A sccs (e.g. a.4.5.11) compactly represents a domain's classification. 108 The letter represents the class, and the numbers are the fold, 109 superfamily, and family, respectively. 110 111 """ 112 113 s1 = sccs1.split(".") 114 s2 = sccs2.split(".") 115 116 if s1[0] != s2[0]: 117 return _cmp(s1[0], s2[0]) 118 119 s1 = [int(x) for x in s1[1:]] 120 s2 = [int(x) for x in s2[1:]] 121 122 return _cmp(s1, s2)
123 124 125 _domain_re = re.compile(r">?([\w_\.]*)\s+([\w\.]*)\s+\(([^)]*)\) (.*)") 126 127
128 -def parse_domain(str):
129 """Convert an ASTRAL header string into a Scop domain. 130 131 An ASTRAL (http://astral.stanford.edu/) header contains a concise 132 description of a SCOP domain. A very similar format is used when a 133 Domain object is converted into a string. The Domain returned by this 134 method contains most of the SCOP information, but it will not be located 135 within the SCOP hierarchy (i.e. The parent node will be None). The 136 description is composed of the SCOP protein and species descriptions. 137 138 A typical ASTRAL header looks like -- 139 >d1tpt_1 a.46.2.1 (1-70) Thymidine phosphorylase {Escherichia coli} 140 """ 141 142 m = _domain_re.match(str) 143 if (not m): 144 raise ValueError("Domain: "+ str) 145 146 dom = Domain() 147 dom.sid = m.group(1) 148 dom.sccs = m.group(2) 149 dom.residues = Residues.Residues(m.group(3)) 150 if not dom.residues.pdbid: 151 dom.residues.pdbid= dom.sid[1:5] 152 dom.description = m.group(4).strip() 153 154 return dom
155 156
157 -def _open_scop_file(scop_dir_path, version, filetype):
158 filename = "dir.%s.scop.txt_%s" % (filetype, version) 159 handle = open(os.path.join( scop_dir_path, filename)) 160 return handle
161 162
163 -class Scop(object):
164 """The entire SCOP hierarchy. 165 166 root -- The root node of the hierarchy 167 """
168 - def __init__(self, cla_handle=None, des_handle=None, hie_handle=None, 169 dir_path=None, db_handle=None, version=None):
170 """Build the SCOP hierarchy from the SCOP parsable files, or a sql backend. 171 172 If no file handles are given, then a Scop object with a single 173 empty root node is returned. 174 175 If a directory and version are given (with dir_path=.., version=...) or 176 file handles for each file, the whole scop tree will be built in memory. 177 178 If a MySQLdb database handle is given, the tree will be built as needed, 179 minimising construction times. To build the SQL database to the methods 180 write_xxx_sql to create the tables. 181 182 """ 183 self._sidDict = {} 184 self._sunidDict = {} 185 186 if all(h is None for h in [cla_handle, des_handle, hie_handle, dir_path, db_handle]): 187 return 188 189 if dir_path is None and db_handle is None: 190 if cla_handle is None or des_handle is None or hie_handle is None: 191 raise RuntimeError("Need CLA, DES and HIE files to build SCOP") 192 193 sunidDict = {} 194 195 self.db_handle = db_handle 196 try: 197 if db_handle: 198 # do nothing if we have a db handle, we'll do it all on the fly 199 pass 200 else: 201 # open SCOP parseable files 202 if dir_path: 203 if not version: 204 raise RuntimeError("Need SCOP version to find parsable files in directory") 205 if cla_handle or des_handle or hie_handle: 206 raise RuntimeError("Cannot specify SCOP directory and specific files") 207 208 cla_handle = _open_scop_file( dir_path, version, 'cla') 209 des_handle = _open_scop_file( dir_path, version, 'des') 210 hie_handle = _open_scop_file( dir_path, version, 'hie') 211 212 root = Node() 213 domains = [] 214 root.sunid = 0 215 root.type = 'ro' 216 sunidDict[root.sunid] = root 217 self.root = root 218 root.description = 'SCOP Root' 219 220 # Build the rest of the nodes using the DES file 221 records = Des.parse(des_handle) 222 for record in records: 223 if record.nodetype == 'px': 224 n = Domain() 225 n.sid = record.name 226 domains.append(n) 227 else : 228 n = Node() 229 n.sunid = record.sunid 230 n.type = record.nodetype 231 n.sccs = record.sccs 232 n.description = record.description 233 234 sunidDict[n.sunid] = n 235 236 # Glue all of the Nodes together using the HIE file 237 records = Hie.parse(hie_handle) 238 for record in records: 239 if record.sunid not in sunidDict: 240 print(record.sunid) 241 242 n = sunidDict[record.sunid] 243 244 if record.parent != '' : # Not root node 245 246 if record.parent not in sunidDict: 247 raise ValueError("Incomplete data?") 248 249 n.parent = sunidDict[record.parent] 250 251 for c in record.children: 252 if c not in sunidDict: 253 raise ValueError("Incomplete data?") 254 n.children.append(sunidDict[c]) 255 256 # Fill in the gaps with information from the CLA file 257 sidDict = {} 258 records = Cla.parse(cla_handle) 259 for record in records: 260 n = sunidDict[record.sunid] 261 assert n.sccs == record.sccs 262 assert n.sid == record.sid 263 n.residues = record.residues 264 sidDict[n.sid] = n 265 266 # Clean up 267 self._sunidDict = sunidDict 268 self._sidDict = sidDict 269 self._domains = tuple(domains) 270 271 finally: 272 if dir_path: 273 # If we opened the files, we close the files 274 if cla_handle: 275 cla_handle.close() 276 if des_handle: 277 des_handle.close() 278 if hie_handle: 279 hie_handle.close()
280
281 - def getRoot(self):
282 return self.getNodeBySunid(0)
283
284 - def getDomainBySid(self, sid):
285 """Return a domain from its sid""" 286 if sid in self._sidDict: 287 return self._sidDict[sid] 288 if self.db_handle: 289 self.getDomainFromSQL(sid=sid) 290 if sid in self._sidDict: 291 return self._sidDict[sid] 292 else: 293 return None
294
295 - def getNodeBySunid(self, sunid):
296 """Return a node from its sunid""" 297 if sunid in self._sunidDict: 298 return self._sunidDict[sunid] 299 if self.db_handle: 300 self.getDomainFromSQL(sunid=sunid) 301 if sunid in self._sunidDict: 302 return self._sunidDict[sunid] 303 else: 304 return None
305
306 - def getDomains(self):
307 """Returns an ordered tuple of all SCOP Domains""" 308 if self.db_handle: 309 return self.getRoot().getDescendents('px') 310 else: 311 return self._domains
312
313 - def write_hie(self, handle):
314 """Build an HIE SCOP parsable file from this object""" 315 # We order nodes to ease comparison with original file 316 for n in sorted(self._sunidDict.values(), key=lambda n: n.sunid): 317 handle.write(str(n.toHieRecord()))
318
319 - def write_des(self, handle):
320 """Build a DES SCOP parsable file from this object""" 321 # Origional SCOP file is not ordered? 322 for n in sorted(self._sunidDict.values(), key=lambda n: n.sunid): 323 if n != self.root: 324 handle.write(str(n.toDesRecord()))
325
326 - def write_cla(self, handle):
327 """Build a CLA SCOP parsable file from this object""" 328 # We order nodes to ease comparison with original file 329 for n in sorted(self._sidDict.values(), key=lambda n: n.sunid): 330 handle.write(str(n.toClaRecord()))
331
332 - def getDomainFromSQL(self, sunid=None, sid=None):
333 """Load a node from the SQL backend using sunid or sid""" 334 if sunid is None and sid is None: 335 return None 336 337 cur = self.db_handle.cursor() 338 339 if sid: 340 cur.execute("SELECT sunid FROM cla WHERE sid=%s", sid) 341 res = cur.fetchone() 342 if res is None: 343 return None 344 sunid = res[0] 345 346 cur.execute("SELECT * FROM des WHERE sunid=%s", sunid) 347 data = cur.fetchone() 348 349 if data is not None: 350 n = None 351 352 #determine if Node or Domain 353 if data[1] != "px": 354 n = Node(scop=self) 355 356 cur.execute("SELECT child FROM hie WHERE parent=%s", sunid) 357 children = [] 358 for c in cur.fetchall(): 359 children.append(c[0]) 360 n.children = children 361 else: 362 n = Domain(scop=self) 363 cur.execute("select sid, residues, pdbid from cla where sunid=%s", 364 sunid) 365 366 [n.sid, n.residues, pdbid] = cur.fetchone() 367 n.residues = Residues.Residues(n.residues) 368 n.residues.pdbid=pdbid 369 self._sidDict[n.sid] = n 370 371 [n.sunid, n.type, n.sccs, n.description] = data 372 373 if data[1] != 'ro': 374 cur.execute("SELECT parent FROM hie WHERE child=%s", sunid) 375 n.parent = cur.fetchone()[0] 376 377 n.sunid = int(n.sunid) 378 379 self._sunidDict[n.sunid] = n
380
381 - def getAscendentFromSQL(self, node, type):
382 """Get ascendents using SQL backend""" 383 if nodeCodeOrder.index(type) >= nodeCodeOrder.index(node.type): 384 return None 385 386 cur = self.db_handle.cursor() 387 cur.execute("SELECT "+type+" from cla WHERE "+node.type+"=%s", (node.sunid)) 388 result = cur.fetchone() 389 if result is not None: 390 return self.getNodeBySunid(result[0]) 391 else: 392 return None
393
394 - def getDescendentsFromSQL(self, node, type):
395 """Get descendents of a node using the database backend. This avoids 396 repeated iteration of SQL calls and is therefore much quicker than 397 repeatedly calling node.getChildren(). 398 """ 399 if nodeCodeOrder.index(type) <= nodeCodeOrder.index(node.type): 400 return [] 401 402 des_list = [] 403 404 # SQL cla table knows nothing about 'ro' 405 if node.type == 'ro': 406 for c in node.getChildren(): 407 for d in self.getDescendentsFromSQL(c, type): 408 des_list.append(d) 409 return des_list 410 411 cur = self.db_handle.cursor() 412 413 if type != 'px': 414 cur.execute("SELECT DISTINCT des.sunid,des.type,des.sccs,description FROM \ 415 cla,des WHERE cla."+node.type+"=%s AND cla."+type+"=des.sunid", (node.sunid)) 416 data = cur.fetchall() 417 for d in data: 418 if int(d[0]) not in self._sunidDict: 419 n = Node(scop=self) 420 [n.sunid, n.type, n.sccs, n.description] = d 421 n.sunid=int(n.sunid) 422 self._sunidDict[n.sunid] = n 423 424 cur.execute("SELECT parent FROM hie WHERE child=%s", n.sunid) 425 n.parent = cur.fetchone()[0] 426 427 cur.execute("SELECT child FROM hie WHERE parent=%s", n.sunid) 428 children = [] 429 for c in cur.fetchall(): 430 children.append(c[0]) 431 n.children = children 432 433 des_list.append( self._sunidDict[int(d[0])] ) 434 435 else: 436 cur.execute("SELECT cla.sunid,sid,pdbid,residues,cla.sccs,type,description,sp\ 437 FROM cla,des where cla.sunid=des.sunid and cla."+node.type+"=%s", 438 node.sunid) 439 440 data = cur.fetchall() 441 for d in data: 442 if int(d[0]) not in self._sunidDict: 443 n = Domain(scop=self) 444 #[n.sunid, n.sid, n.pdbid, n.residues, n.sccs, n.type, 445 #n.description,n.parent] = data 446 [n.sunid, n.sid, pdbid, n.residues, n.sccs, n.type, n.description, 447 n.parent] = d[0:8] 448 n.residues = Residues.Residues(n.residues) 449 n.residues.pdbid = pdbid 450 n.sunid = int(n.sunid) 451 self._sunidDict[n.sunid] = n 452 self._sidDict[n.sid] = n 453 454 des_list.append( self._sunidDict[int(d[0])] ) 455 456 return des_list
457
458 - def write_hie_sql(self, handle):
459 """Write HIE data to SQL database""" 460 cur = handle.cursor() 461 462 cur.execute("DROP TABLE IF EXISTS hie") 463 cur.execute("CREATE TABLE hie (parent INT, child INT, PRIMARY KEY (child),\ 464 INDEX (parent) )") 465 466 for p in self._sunidDict.values(): 467 for c in p.children: 468 cur.execute("INSERT INTO hie VALUES (%s,%s)" % (p.sunid, c.sunid))
469
470 - def write_cla_sql(self, handle):
471 """Write CLA data to SQL database""" 472 cur = handle.cursor() 473 474 cur.execute("DROP TABLE IF EXISTS cla") 475 cur.execute("CREATE TABLE cla (sunid INT, sid CHAR(8), pdbid CHAR(4),\ 476 residues VARCHAR(50), sccs CHAR(10), cl INT, cf INT, sf INT, fa INT,\ 477 dm INT, sp INT, px INT, PRIMARY KEY (sunid), INDEX (SID) )") 478 479 for n in self._sidDict.values(): 480 c = n.toClaRecord() 481 cur.execute( "INSERT INTO cla VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)", 482 (n.sunid, n.sid, c.residues.pdbid, c.residues, n.sccs, 483 n.getAscendent('cl').sunid, n.getAscendent('cf').sunid, 484 n.getAscendent('sf').sunid, n.getAscendent('fa').sunid, 485 n.getAscendent('dm').sunid, n.getAscendent('sp').sunid, 486 n.sunid ))
487
488 - def write_des_sql(self, handle):
489 """Write DES data to SQL database""" 490 cur = handle.cursor() 491 492 cur.execute("DROP TABLE IF EXISTS des") 493 cur.execute("CREATE TABLE des (sunid INT, type CHAR(2), sccs CHAR(10),\ 494 description VARCHAR(255),\ 495 PRIMARY KEY (sunid) )") 496 497 for n in self._sunidDict.values(): 498 cur.execute( "INSERT INTO des VALUES (%s,%s,%s,%s)", 499 ( n.sunid, n.type, n.sccs, n.description ) )
500 501
502 -class Node(object):
503 """ A node in the Scop hierarchy 504 505 sunid -- SCOP unique identifiers. e.g. '14986' 506 507 parent -- The parent node 508 509 children -- A list of child nodes 510 511 sccs -- SCOP concise classification string. e.g. 'a.1.1.2' 512 513 type -- A 2 letter node type code. e.g. 'px' for domains 514 515 description -- 516 517 """
518 - def __init__(self, scop=None):
519 """Create a Node in the scop hierarchy. If a Scop instance is provided to the 520 constructor, this will be used to lookup related references using the SQL 521 methods. If no instance is provided, it is assumed the whole tree exists 522 and is connected.""" 523 self.sunid = '' 524 self.parent = None 525 self.children = [] 526 self.sccs = '' 527 self.type = '' 528 self.description = '' 529 self.scop = scop
530
531 - def __str__(self):
532 s = [] 533 s.append(str(self.sunid)) 534 s.append(self.sccs) 535 s.append(self.type) 536 s.append(self.description) 537 538 return " ".join(s)
539
540 - def toHieRecord(self):
541 """Return an Hie.Record""" 542 rec = Hie.Record() 543 rec.sunid = str(self.sunid) 544 if self.getParent(): # Not root node 545 rec.parent = str(self.getParent().sunid) 546 else: 547 rec.parent = '-' 548 for c in self.getChildren(): 549 rec.children.append(str(c.sunid)) 550 return rec
551
552 - def toDesRecord(self):
553 """Return a Des.Record""" 554 rec = Des.Record() 555 rec.sunid = str(self.sunid) 556 rec.nodetype = self.type 557 rec.sccs = self.sccs 558 rec.description = self.description 559 return rec
560
561 - def getChildren(self):
562 """Return a list of children of this Node""" 563 if self.scop is None: 564 return self.children 565 else: 566 return [self.scop.getNodeBySunid(x) for x in self.children]
567
568 - def getParent(self):
569 """Return the parent of this Node""" 570 if self.scop is None: 571 return self.parent 572 else: 573 return self.scop.getNodeBySunid( self.parent )
574
575 - def getDescendents( self, node_type):
576 """ Return a list of all decendent nodes of the given type. Node type can a 577 two letter code or longer description. e.g. 'fa' or 'family' 578 """ 579 if node_type in _nodetype_to_code: 580 node_type = _nodetype_to_code[node_type] 581 582 nodes = [self] 583 if self.scop: 584 return self.scop.getDescendentsFromSQL(self, node_type) 585 while nodes[0].type != node_type: 586 if nodes[0].type == 'px': 587 return [] # Fell of the bottom of the hierarchy 588 child_list = [] 589 for n in nodes: 590 for child in n.getChildren(): 591 child_list.append( child ) 592 nodes = child_list 593 594 return nodes
595
596 - def getAscendent( self, node_type):
597 """ Return the ancenstor node of the given type, or None.Node type can a 598 two letter code or longer description. e.g. 'fa' or 'family'""" 599 if node_type in _nodetype_to_code: 600 node_type = _nodetype_to_code[node_type] 601 602 if self.scop: 603 return self.scop.getAscendentFromSQL(self, node_type) 604 else: 605 n = self 606 if n.type == node_type: 607 return None 608 609 while n.type != node_type: 610 if n.type == 'ro': 611 return None # Fell of the top of the hierarchy 612 n = n.getParent() 613 614 return n
615 616
617 -class Domain(Node):
618 """ A SCOP domain. A leaf node in the Scop hierarchy. 619 620 sid -- The SCOP domain identifier. e.g. 'd5hbib_' 621 622 residues -- A Residue object. It defines the collection 623 of PDB atoms that make up this domain. 624 """
625 - def __init__(self,scop=None):
626 Node.__init__(self, scop=scop) 627 self.sid = '' 628 self.residues = None
629
630 - def __str__(self):
631 s = [] 632 s.append(self.sid) 633 s.append(self.sccs) 634 s.append("("+str(self.residues)+")") 635 636 if not self.getParent(): 637 s.append(self.description) 638 else: 639 sp = self.getParent() 640 dm = sp.getParent() 641 s.append(dm.description) 642 s.append("{"+sp.description+"}") 643 644 return " ".join(s)
645
646 - def toDesRecord(self):
647 """Return a Des.Record""" 648 rec = Node.toDesRecord(self) 649 rec.name = self.sid 650 return rec
651
652 - def toClaRecord(self):
653 """Return a Cla.Record""" 654 rec = Cla.Record() 655 rec.sid = self.sid 656 rec.residues = self.residues 657 rec.sccs = self.sccs 658 rec.sunid = self.sunid 659 660 n = self 661 while n.sunid != 0: # Not root node 662 rec.hierarchy[n.type] = str(n.sunid) 663 n = n.getParent() 664 665 # Order does not matter in the hierarchy field. For more info, see 666 # http://scop.mrc-lmb.cam.ac.uk/scop/release-notes.html 667 #rec.hierarchy.reverse() 668 669 return rec
670 671
672 -class Astral(object):
673 """Abstraction of the ASTRAL database, which has sequences for all the SCOP domains, 674 as well as clusterings by percent id or evalue. 675 """ 676
677 - def __init__( self, dir_path=None, version=None, scop=None, 678 astral_file=None, db_handle=None):
679 """ 680 Initialise the astral database. 681 682 You must provide either a directory of SCOP files: 683 684 dir_path - string, the path to location of the scopseq-x.xx directory 685 (not the directory itself), and 686 version -a version number. 687 688 or, a FASTA file: 689 690 astral_file - string, a path to a fasta file (which will be loaded in memory) 691 692 or, a MYSQL database: 693 694 db_handle - a database handle for a MYSQL database containing a table 695 'astral' with the astral data in it. This can be created 696 using writeToSQL. 697 """ 698 699 if astral_file is None and dir_path is None and db_handle is None: 700 raise RuntimeError("Need either file handle, or (dir_path + " 701 + "version) or database handle to construct Astral") 702 if not scop: 703 raise RuntimeError("Must provide a Scop instance to construct") 704 705 self.scop = scop 706 self.db_handle = db_handle 707 708 if not astral_file and not db_handle: 709 if dir_path is None or version is None: 710 raise RuntimeError("must provide dir_path and version") 711 712 self.version = version 713 self.path = os.path.join( dir_path, "scopseq-%s" % version) 714 astral_file = "astral-scopdom-seqres-all-%s.fa" % self.version 715 astral_file = os.path.join(self.path, astral_file) 716 717 if astral_file: 718 #Build a dictionary of SeqRecord objects in the FASTA file, IN MEMORY 719 self.fasta_dict = SeqIO.to_dict(SeqIO.parse(astral_file, "fasta")) 720 721 self.astral_file = astral_file 722 self.EvDatasets = {} 723 self.EvDatahash = {} 724 self.IdDatasets = {} 725 self.IdDatahash = {}
726
727 - def domainsClusteredByEv(self, id):
728 """get domains clustered by evalue""" 729 if id not in self.EvDatasets: 730 if self.db_handle: 731 self.EvDatasets[id] = self.getAstralDomainsFromSQL(astralEv_to_sql[id]) 732 733 else: 734 if not self.path: 735 raise RuntimeError("No scopseq directory specified") 736 737 file_prefix = "astral-scopdom-seqres-sel-gs" 738 filename = "%s-e100m-%s-%s.id" % (file_prefix, astralEv_to_file[id], 739 self.version) 740 filename = os.path.join(self.path, filename) 741 self.EvDatasets[id] = self.getAstralDomainsFromFile(filename) 742 return self.EvDatasets[id]
743
744 - def domainsClusteredById(self, id):
745 """get domains clustered by percent id""" 746 if id not in self.IdDatasets: 747 if self.db_handle: 748 self.IdDatasets[id] = self.getAstralDomainsFromSQL("id"+str(id)) 749 else: 750 if not self.path: 751 raise RuntimeError("No scopseq directory specified") 752 753 file_prefix = "astral-scopdom-seqres-sel-gs" 754 filename = "%s-bib-%s-%s.id" % (file_prefix, id, self.version) 755 filename = os.path.join(self.path, filename) 756 self.IdDatasets[id] = self.getAstralDomainsFromFile(filename) 757 return self.IdDatasets[id]
758
759 - def getAstralDomainsFromFile(self,filename=None,file_handle=None):
760 """Get the scop domains from a file containing a list of sids""" 761 if file_handle is None and filename is None: 762 raise RuntimeError("You must provide a filename or handle") 763 if not file_handle: 764 file_handle = open(filename) 765 doms = [] 766 while True: 767 line = file_handle.readline() 768 if not line: 769 break 770 line = line.rstrip() 771 doms.append(line) 772 if filename: 773 file_handle.close() 774 775 doms = [a for a in doms if a[0]=='d'] 776 doms = [self.scop.getDomainBySid(x) for x in doms] 777 return doms
778
779 - def getAstralDomainsFromSQL(self, column):
780 """Load a set of astral domains from a column in the astral table of a MYSQL 781 database (which can be created with writeToSQL(...)""" 782 cur = self.db_handle.cursor() 783 cur.execute("SELECT sid FROM astral WHERE "+column+"=1") 784 data = cur.fetchall() 785 data = [self.scop.getDomainBySid(x[0]) for x in data] 786 787 return data
788
789 - def getSeqBySid(self, domain):
790 """get the seq record of a given domain from its sid""" 791 if self.db_handle is None: 792 return self.fasta_dict[domain].seq 793 else: 794 cur = self.db_handle.cursor() 795 cur.execute("SELECT seq FROM astral WHERE sid=%s", domain) 796 return Seq(cur.fetchone()[0])
797
798 - def getSeq(self, domain):
799 """Return seq associated with domain""" 800 return self.getSeqBySid(domain.sid)
801
802 - def hashedDomainsById(self, id):
803 """Get domains clustered by sequence identity in a dict""" 804 if id not in self.IdDatahash: 805 self.IdDatahash[id] = {} 806 for d in self.domainsClusteredById(id): 807 self.IdDatahash[id][d] = 1 808 return self.IdDatahash[id]
809
810 - def hashedDomainsByEv(self, id):
811 """Get domains clustered by evalue in a dict""" 812 if id not in self.EvDatahash: 813 self.EvDatahash[id] = {} 814 for d in self.domainsClusteredByEv(id): 815 self.EvDatahash[id][d] = 1 816 return self.EvDatahash[id]
817
818 - def isDomainInId(self, dom, id):
819 """Returns true if the domain is in the astral clusters for percent ID""" 820 return dom in self.hashedDomainsById(id)
821
822 - def isDomainInEv(self, dom, id):
823 """Returns true if the domain is in the ASTRAL clusters for evalues""" 824 return dom in self.hashedDomainsByEv(id)
825
826 - def writeToSQL(self, db_handle):
827 """Write the ASTRAL database to a MYSQL database""" 828 cur = db_handle.cursor() 829 830 cur.execute("DROP TABLE IF EXISTS astral") 831 cur.execute("CREATE TABLE astral (sid CHAR(8), seq TEXT, PRIMARY KEY (sid))") 832 833 for dom in self.fasta_dict: 834 cur.execute("INSERT INTO astral (sid,seq) values (%s,%s)", 835 (dom, self.fasta_dict[dom].seq.data)) 836 837 for i in astralBibIds: 838 cur.execute("ALTER TABLE astral ADD (id"+str(i)+" TINYINT)") 839 840 for d in self.domainsClusteredById(i): 841 cur.execute("UPDATE astral SET id"+str(i)+"=1 WHERE sid=%s", 842 d.sid) 843 844 for ev in astralEvs: 845 cur.execute("ALTER TABLE astral ADD ("+astralEv_to_sql[ev]+" TINYINT)") 846 847 for d in self.domainsClusteredByEv(ev): 848 cur.execute("UPDATE astral SET "+astralEv_to_sql[ev]+"=1 WHERE sid=%s", 849 d.sid)
850 851
852 -def search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 853 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds):
854 """search(pdb=None, key=None, sid=None, disp=None, dir=None, loc=None, 855 cgi='http://scop.mrc-lmb.cam.ac.uk/scop/search.cgi', **keywds) 856 857 Access search.cgi and return a handle to the results. See the 858 online help file for an explanation of the parameters: 859 http://scop.mrc-lmb.cam.ac.uk/scop/help.html 860 861 Raises an IOError if there's a network error. 862 863 """ 864 params = {'pdb' : pdb, 'key' : key, 'sid' : sid, 'disp' : disp, 865 'dir' : dir, 'loc' : loc} 866 variables = {} 867 for k, v in params.items(): 868 if v is not None: 869 variables[k] = v 870 variables.update(keywds) 871 return _open(cgi, variables)
872 873
874 -def _open(cgi, params={}, get=1):
875 """_open(cgi, params={}, get=1) -> UndoHandle 876 877 Open a handle to SCOP. cgi is the URL for the cgi script to access. 878 params is a dictionary with the options to pass to it. get is a boolean 879 that describes whether a GET should be used. Does some 880 simple error checking, and will raise an IOError if it encounters one. 881 882 """ 883 from Bio._py3k import urlopen, urlencode 884 885 # Open a handle to SCOP. 886 options = urlencode(params) 887 if get: # do a GET 888 if options: 889 cgi += "?" + options 890 handle = urlopen(cgi) 891 else: # do a POST 892 handle = urlopen(cgi, data=options) 893 return handle
894