Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2  # 
  3  # PDBList.py 
  4  # 
  5  # A tool for tracking changes in the PDB Protein Structure Database. 
  6  # 
  7  # (c) 2003 Kristian Rother 
  8  # This work was supported by the German Ministry of Education 
  9  # and Research (BMBF). Project http://www.bcbio.de 
 10  # 
 11  # Contact the author 
 12  #    homepage : http://www.rubor.de/bioinf 
 13  #    email    : krother@genesilico.pl 
 14  # 
 15  # 
 16  # This code is released under the conditions of the Biopython license. 
 17  # It may be distributed freely with respect to the original author. 
 18  # Any maintainer of the Biopython code may change this notice 
 19  # when appropriate. 
 20   
 21  """ Access the PDB over the internet (e.g. to download structures). """ 
 22   
 23  from __future__ import print_function 
 24   
 25  import contextlib 
 26  import gzip 
 27  import os 
 28  import shutil 
 29   
 30  #Importing these functions with leading underscore as not intended for reuse 
 31  from Bio._py3k import urlopen as _urlopen 
 32  from Bio._py3k import urlretrieve as _urlretrieve 
 33   
 34   
class PDBList(object):
    """Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    The class also provides a function to retrieve PDB files from the
    server. To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    If you want to use this module from inside a proxy, add
    the proxy variable to your environment, e.g. in Unix:
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    # Just append a PDB code to this URL to download the entry.
    alternative_download_url = "http://www.rcsb.org/pdb/files/"

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the remote PDB server
        @type server: string

        @param pdb: root of the local PDB file tree (default: the working
            directory at the time the instance is created)
        @type pdb: string

        @param obsolete_pdb: root of the local tree for obsolete entries
            (default: an 'obsolete' directory below the local PDB tree,
            created if it does not exist yet)
        @type obsolete_pdb: string
        """
        self.pdb_server = server  # remote pdb server
        # Resolve the default here rather than in the signature: a default
        # of os.getcwd() would be frozen when the module is imported.
        self.local_pdb = pdb if pdb else os.getcwd()  # local pdb file tree

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    # ------------------------------------------------------------------
    # Pure parsing helpers (no network access), shared by the public API.
    # ------------------------------------------------------------------

    @staticmethod
    def _as_text(data):
        """Return data as a native string, decoding bytes if necessary.

        Handles opened via urlopen yield bytes on Python 3, while the
        rest of this class works with native strings.
        """
        if isinstance(data, str):
            return data
        return data.decode("utf-8", "replace")

    @classmethod
    def _parse_status_lines(cls, lines):
        """Extract the PDB codes from the lines of a weekly status file."""
        codes = []
        for raw in lines:
            code = cls._as_text(raw).strip()
            if not code:
                continue  # tolerate blank (e.g. trailing) lines
            if len(code) != 4:
                raise ValueError("Unexpected entry %r in PDB status list"
                                 % code)
            codes.append(code)
        return codes

    @classmethod
    def _latest_status_dir(cls, lines):
        """Return the largest all-digit name found in a directory listing."""
        names = []
        for raw in lines:
            fields = cls._as_text(raw).split()
            if fields and fields[-1].isdigit():
                names.append(fields[-1])
        if not names:
            raise IndexError("No dated status directory found in listing")
        return max(names, key=int)

    @classmethod
    def _parse_index_lines(cls, lines):
        """Extract the PDB codes from the lines of the entries.idx file."""
        # The first two lines of the index are skipped, as are lines too
        # short to carry a code followed by anything else.
        return [cls._as_text(raw[:4]) for position, raw in enumerate(lines)
                if position >= 2 and len(raw) > 4]

    @classmethod
    def _parse_obsolete_lines(cls, lines):
        """Extract the obsoleted PDB codes from the lines of obsolete.dat."""
        codes = []
        for raw in lines:
            line = cls._as_text(raw)
            if not line.startswith("OBSLTE "):
                continue
            fields = line.split()
            # Fields are: OBSLTE, date, obsolete code, [successor codes...]
            if len(fields) < 3 or len(fields[2]) != 4:
                raise ValueError("Malformed OBSLTE record: %r" % line)
            codes.append(fields[2])
        return codes

    @staticmethod
    def _decompress(archive, target):
        """Gunzip archive into target, removing target again on failure."""
        try:
            with contextlib.closing(gzip.open(archive, 'rb')) as gz:
                with open(target, 'wb') as out:
                    shutil.copyfileobj(gz, out)
        except Exception:
            # Never leave a truncated structure file behind: a later run
            # with overwrite disabled would mistake it for a complete one.
            if os.path.exists(target):
                os.remove(target)
            raise

    @staticmethod
    def _write_list(listfile, entries):
        """Write one PDB code per line to listfile."""
        with open(listfile, 'w') as outfile:
            outfile.writelines(x + '\n' for x in entries)

    def _move_to_obsolete(self, pdb_code):
        """Move the local file of one entry into the obsolete tree."""
        # Local files are named after the lower-cased code (see
        # retrieve_pdb_file), so normalise before building paths.
        code = pdb_code.lower()
        basename = 'pdb%s.ent' % code
        if self.flat_tree:
            old_dir = self.local_pdb
            new_dir = self.obsolete_pdb
        else:
            old_dir = os.path.join(self.local_pdb, code[1:3])
            new_dir = os.path.join(self.obsolete_pdb, code[1:3])
        old_file = os.path.join(old_dir, basename)
        new_file = os.path.join(new_dir, basename)

        if os.path.isfile(old_file):
            try:
                if not os.path.isdir(new_dir):
                    os.makedirs(new_dir)
                shutil.move(old_file, new_file)
            except Exception:
                print("Could not move %s to obsolete folder" % old_file)
        elif os.path.isfile(new_file):
            print("Obsolete file %s already moved" % old_file)
        else:
            print("Obsolete file %s is missing" % old_file)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get_status_list(self, url):
        """Retrieve the list of pdb codes in a weekly pdb status file.

        Used by get_recent_changes. The list files parsed by this method
        simply contain one PDB name per line.

        @param url: location of the status file
        @type url: string

        @return: the PDB codes listed in the file
        @rtype: list of strings

        @raise ValueError: if a non-blank line is not a four-letter code
        """
        with contextlib.closing(_urlopen(url)) as handle:
            return self._parse_status_lines(handle)

    def get_recent_changes(self):
        """Return three lists of the newest weekly files (added,mod,obsolete).

        Reads the directory listing with changed entries from the PDB
        server and returns the PDB codes of the new, modified and obsolete
        entries from the most recent list. The directory with the largest
        numerical name is used.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README

        @return: [added, modified, obsolete], each a list of PDB codes
        @rtype: list of three lists of strings

        @raise IndexError: if the listing contains no all-digit name
        """
        url = self.pdb_server + '/pub/pdb/data/status/'
        with contextlib.closing(_urlopen(url)) as handle:
            recent = self._latest_status_dir(handle)

        path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieve a big file containing all the PDB entries.

        Downloads the index file with all PDB entries and some annotation
        to them.

        @return: the PDB codes in the index file
        @rtype: list of strings
        """
        print("retrieving index file. Takes about 5 MB.")
        url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
        with contextlib.closing(_urlopen(url)) as handle:
            return self._parse_index_lines(handle)

    def get_all_obsolete(self):
        """Return a list of all obsolete entries ever in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this:

         LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE    31-JUL-94 116L     216L
        ...
        OBSLTE    29-JAN-96 1HFT     2HFT
        OBSLTE    21-SEP-06 1HFV     2J5X
        OBSLTE    21-NOV-03 1HG6
        OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
        OBSLTE    08-NOV-96 1HID     2HID
        OBSLTE    01-APR-97 1HIU     2HIU
        OBSLTE    14-JAN-04 1HKE     1UUZ
        ...

        @return: all obsolete PDB codes
        @rtype: list of strings

        @raise ValueError: if an OBSLTE record cannot be parsed
        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        with contextlib.closing(_urlopen(url)) as handle:
            return self._parse_obsolete_lines(handle)

    def retrieve_pdb_file(self, pdb_code, obsolete=False, pdir=None):
        """Retrieve a PDB structure file and store it in a local file tree.

        The PDB structure's file name is returned as a single string.
        If obsolete == True, the file will be saved in a special file tree.
        An already existing local file is reused unless self.overwrite
        is set.

        @param pdb_code: four-letter PDB code (case-insensitive)
        @type pdb_code: string

        @param obsolete: fetch from, and store into, the obsolete tree
        @type obsolete: bool

        @param pdir: put the file in this directory (default: create a
            PDB-style directory tree)
        @type pdir: string

        @return: filename
        @rtype: string
        """
        # Get the compressed PDB structure
        code = pdb_code.lower()
        archive_fn = "pdb%s.ent.gz" % code
        pdb_dir = "obsolete" if obsolete else "divided"
        url = (self.pdb_server +
               '/pub/pdb/data/structures/%s/pdb/%s/%s' %
               (pdb_dir, code[1:3], archive_fn))

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.obsolete_pdb if obsolete else self.local_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, archive_fn)
        final_file = os.path.join(path, "pdb%s.ent" % code)  # (decompressed)

        # Skip download if the file already exists
        if not self.overwrite and os.path.exists(final_file):
            print("Structure exists: '%s' " % final_file)
            return final_file

        # Retrieve the file, uncompress it, and always drop the archive
        # (also when the download or the decompression fails).
        print("Downloading PDB structure '%s'..." % pdb_code)
        try:
            _urlretrieve(url, filename)
            self._decompress(filename, final_file)
        finally:
            if os.path.exists(filename):
                os.remove(filename)

        return final_file

    def update_pdb(self):
        """Download the weekly new/modified entries and retire obsolete ones.

        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files. Local files of
        entries that became obsolete are moved to the obsolete tree.
        You can call this module as a weekly cronjob.

        @raise ValueError: if the local or obsolete PDB directory is missing
        """
        for required in (self.local_pdb, self.obsolete_pdb):
            if not os.path.isdir(required):
                raise ValueError("Not a directory: %s" % required)

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                # Best effort: one failing entry must not stop the update.
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # Move the obsolete files to a special folder
        for pdb_code in obsolete:
            self._move_to_obsolete(pdb_code)

    def download_entire_pdb(self, listfile=None):
        """Retrieve all PDB entries not present in the local PDB copy.

        @param listfile: if given, a list of all PDB codes is written to
            this file
        @type listfile: string
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)
        # Write the list
        if listfile:
            self._write_list(listfile, entries)

    def download_obsolete_entries(self, listfile=None):
        """Retrieve all obsolete PDB entries not present locally.

        The files are stored in the local obsolete PDB copy.

        @param listfile: if given, a list of all obsolete PDB codes is
            written to this file
        @type listfile: string
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=True)

        # Write the list
        if listfile:
            self._write_list(listfile, entries)

    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieve a (big) file with the sequences of all PDB entries.

        @param savefile: local file the sequences are written to
        @type savefile: string
        """
        print("Retrieving sequence file (takes about 15 MB).")
        url = self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt'
        _urlretrieve(url, savefile)
if __name__ == '__main__':
    # Command-line entry point: print the usage text, then run the
    # requested command (if any) against a PDBList instance.

    import sys

    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    args = sys.argv[1:]
    # Only build a PDBList when there is a command to run: constructing
    # one creates the 'obsolete' directory as a side effect, which a bare
    # usage request should not trigger.
    if args:
        command = args[0]

        if len(args) > 1:
            pdb_path = args[1]
            pl = PDBList(pdb=pdb_path)
            for option in args[2:]:
                if option == '-d':
                    pl.flat_tree = 1
                elif option == '-o':
                    pl.overwrite = 1
        else:
            # No path given: work flat in the current directory.
            pdb_path = os.getcwd()
            pl = PDBList()
            pl.flat_tree = 1

        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif command == 'obsol':
            # get all obsolete entries; the target tree is already set on
            # the instance. The only parameter of this method is an
            # optional list file, so pdb_path must not be passed here.
            pl.download_obsolete_entries()

        elif len(command) == 4 and command[0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(command, pdir=pdb_path)