Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2  # 
  3  # PDBList.py 
  4  # 
  5  # A tool for tracking changes in the PDB Protein Structure Database. 
  6  # 
  7  # (c) 2003 Kristian Rother 
  8  # This work was supported by the German Ministry of Education 
  9  # and Research (BMBF). Project http://www.bcbio.de 
 10  # 
 11  # Contact the author 
 12  #    homepage : http://www.rubor.de/bioinf 
 13  #    email    : krother@genesilico.pl 
 14  # 
 15  # 
 16  # This code is released under the conditions of the Biopython license. 
 17  # It may be distributed freely with respect to the original author. 
 18  # Any maintainer of the Biopython code may change this notice 
 19  # when appropriate. 
 20   
 21  """ Access the PDB over the internet (e.g. to download structures). """ 
 22   
 23  # For using with statement in Python 2.5 or Jython 
 24  from __future__ import with_statement 
 25   
 26  import contextlib 
 27  import gzip 
 28  import os 
 29  import shutil 
 30  import urllib 
 31  from urllib2 import urlopen as _urlopen  # urllib made too many FTP conn's 
 32   
 33   
class PDBList(object):
    """Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    The class also provides a function to retrieve PDB files from the
    server. To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    If you want to use this module from inside a proxy, add
    the proxy variable to your environment, e.g. in Unix:
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    alternative_download_url = "http://www.rcsb.org/pdb/files/"
    # just append PDB code to this, and then it works.

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the remote PDB server
        @type server: string

        @param pdb: root of the local PDB file tree (default: the working
            directory at the time of the call)
        @type pdb: string

        @param obsolete_pdb: root of the local tree for obsolete entries
            (default: an 'obsolete' directory below the local tree, which
            is created if it does not exist)
        @type obsolete_pdb: string
        """
        # Resolve the working directory per call; a default of os.getcwd()
        # in the signature would be frozen when the class is defined.
        if pdb is None:
            pdb = os.getcwd()

        self.pdb_server = server  # remote pdb server
        self.local_pdb = pdb  # local pdb file tree

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    @staticmethod
    def _check_pdb_code(code, source):
        """Return code unchanged, or raise ValueError if it is not 4 long."""
        # An explicit raise (rather than assert) keeps the check alive
        # when Python runs with -O.
        if len(code) != 4:
            raise ValueError("Unexpected PDB code %r in %s" % (code, source))
        return code

    def get_status_list(self, url):
        """Retrieve the list of pdb codes in a weekly pdb status file.

        Used by get_recent_changes. The list files parsed by this method
        contain one PDB name per line; blank lines are ignored.

        @param url: location of the status file
        @type url: string

        @return: the PDB codes in file order
        @rtype: list

        @raise ValueError: if an entry is not four characters long
        """
        answer = []
        with contextlib.closing(_urlopen(url)) as handle:
            for line in handle:
                pdb = line.strip()
                if not pdb:
                    continue
                answer.append(self._check_pdb_code(pdb, url))
        return answer

    def get_recent_changes(self):
        """Return three lists of the newest weekly files (added,mod,obsolete).

        Reads the directory listing with changed entries from the PDB
        server, picks the directory with the largest numerical name and
        returns the PDB codes found in its added, modified and obsolete
        status files.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README

        @return: [added, modified, obsolete], each a list of PDB codes
        @rtype: list

        @raise IndexError: if the listing contains no all-digit entry
        """
        url = self.pdb_server + '/pub/pdb/data/status/'
        with contextlib.closing(_urlopen(url)) as handle:
            # The entry name is the last whitespace-separated field;
            # blank lines have no fields and are dropped.
            names = [fields[-1] for fields in
                     (line.split() for line in handle) if fields]
        dated = [name for name in names if name.isdigit()]
        # Compare numerically so the result does not depend on the order
        # in which the server lists its directories.
        recent = sorted(dated, key=int)[-1]

        path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieve a big file containing all the PDB entries.

        Downloads the index file with all entries and some annotation to
        them, and returns the list of PDB codes found in it.

        @return: the first four characters of every index line
        @rtype: list
        """
        print("retrieving index file. Takes about 5 MB.")
        url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
        with contextlib.closing(_urlopen(url)) as handle:
            lines = handle.readlines()
        # The first two lines are skipped (presumably the column header of
        # the index file), as are lines too short to hold a code.
        return [line[:4] for line in lines[2:] if len(line) > 4]

    def get_all_obsolete(self):
        """Return a list of all obsolete entries ever in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this:

         LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE    31-JUL-94 116L     216L
        ...
        OBSLTE    29-JAN-96 1HFT     2HFT
        OBSLTE    21-SEP-06 1HFV     2J5X
        OBSLTE    21-NOV-03 1HG6
        OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
        OBSLTE    08-NOV-96 1HID     2HID
        OBSLTE    01-APR-97 1HIU     2HIU
        OBSLTE    14-JAN-04 1HKE     1UUZ
        ...

        @return: the obsolete PDB codes in file order
        @rtype: list

        @raise ValueError: if an obsolete code is not four characters long
        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        obsolete = []
        with contextlib.closing(_urlopen(url)) as handle:
            for line in handle:
                if not line.startswith("OBSLTE "):
                    continue
                # Fields: record name, date, obsolete code, successors...
                obsolete.append(self._check_pdb_code(line.split()[2], url))
        return obsolete

    def retrieve_pdb_file(self, pdb_code, obsolete=False, pdir=None):
        """Retrieve a PDB structure file and store it in a local file tree.

        The PDB structure's file name is returned as a single string.
        If obsolete == True, the file will be saved in a special file tree.
        An existing file is reused unless self.overwrite is set.

        @param pdb_code: four-letter PDB code (case-insensitive)
        @type pdb_code: string

        @param obsolete: fetch from, and store in, the obsolete tree
        @type obsolete: bool

        @param pdir: put the file in this directory (default: create a
            PDB-style directory tree)
        @type pdir: string

        @return: filename
        @rtype: string
        """
        # Get the compressed PDB structure
        code = pdb_code.lower()
        archive_fn = "pdb%s.ent.gz" % code
        pdb_dir = "obsolete" if obsolete else "divided"
        url = (self.pdb_server +
               '/pub/pdb/data/structures/%s/pdb/%s/%s' %
               (pdb_dir, code[1:3], archive_fn))

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.obsolete_pdb if obsolete else self.local_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, archive_fn)
        final_file = os.path.join(path, "pdb%s.ent" % code)  # (decompressed)

        # Skip download if the file already exists
        if not self.overwrite and os.path.exists(final_file):
            print("Structure exists: '%s' " % final_file)
            return final_file

        # Retrieve the file
        print("Downloading PDB structure '%s'..." % pdb_code)
        urllib.urlretrieve(url, filename)

        # Uncompress the archive, delete when done. contextlib.closing is
        # used because GzipFile is not a context manager before Python 2.7.
        try:
            with contextlib.closing(gzip.open(filename, 'rb')) as gz:
                with open(final_file, 'wb') as out:
                    shutil.copyfileobj(gz, out)
        except Exception:
            # A truncated structure file would be mistaken for a finished
            # download by the "already exists" check on the next run.
            if os.path.exists(final_file):
                os.remove(final_file)
            raise
        finally:
            os.remove(filename)

        return final_file

    def update_pdb(self):
        """Download the weekly changes and shelve obsolete entries.

        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files. Files of entries
        that became obsolete are moved to the obsolete tree.
        You can call this module as a weekly cronjob.
        """
        assert os.path.isdir(self.local_pdb)
        assert os.path.isdir(self.obsolete_pdb)

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                # Best effort: one failed download must not stop the rest.
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # Move the obsolete files to a special folder
        for pdb_code in obsolete:
            # retrieve_pdb_file stores files under the lower-cased code,
            # so the same normalisation is needed to find them again.
            code = pdb_code.lower()
            entry = 'pdb%s.ent' % code
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, entry)
                new_dir = self.obsolete_pdb
            else:
                old_file = os.path.join(self.local_pdb, code[1:3], entry)
                new_dir = os.path.join(self.obsolete_pdb, code[1:3])
            new_file = os.path.join(new_dir, entry)

            if os.path.isfile(old_file):
                if not os.path.isdir(new_dir):
                    os.mkdir(new_dir)
                try:
                    shutil.move(old_file, new_file)
                except Exception:
                    print("Could not move %s to obsolete folder" % old_file)
            elif os.path.isfile(new_file):
                print("Obsolete file %s already moved" % old_file)
            else:
                print("Obsolete file %s is missing" % old_file)

    def download_entire_pdb(self, listfile=None):
        """Retrieve all PDB entries not present in the local PDB copy.

        @param listfile: if given, the PDB codes are written to this file,
            one per line, after the downloads
        @type listfile: string
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)

        # Write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines(x + '\n' for x in entries)

    def download_obsolete_entries(self, listfile=None):
        """Retrieve all obsolete PDB entries not present locally.

        The files are stored in the local obsolete PDB copy.

        @param listfile: if given, the PDB codes are written to this file,
            one per line, after the downloads
        @type listfile: string
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=True)

        # Write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines(x + '\n' for x in entries)

    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieve a (big) file with the sequences of all PDB entries.

        @param savefile: local file name the download is written to
        @type savefile: string
        """
        print("Retrieving sequence file (takes about 15 MB).")
        url = self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt'
        urllib.urlretrieve(url, savefile)
if __name__ == '__main__':
    # Command-line front end: print the usage text, build a PDBList from
    # the arguments and run the requested command.

    import sys

    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        # Everything after <pdb_path> is an option flag; unknown flags
        # are ignored.
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # No path given: work flat in the current directory.
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif command == 'obsol':
            # get all obsolete entries. The only parameter of
            # download_obsolete_entries is an optional list *file*, so the
            # pdb directory must not be passed here; the target tree was
            # already set when pl was built.
            pl.download_obsolete_entries()

        elif len(command) == 4 and command[0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(command, pdir=pdb_path)