Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2  # 
  3  # PDBList.py 
  4  # 
  5  # A tool for tracking changes in the PDB Protein Structure Database. 
  6  # 
  7  # (c) 2003 Kristian Rother 
  8  # This work was supported by the German Ministry of Education 
  9  # and Research (BMBF). Project http://www.bcbio.de 
 10  # 
 11  # Contact the author 
 12  #    homepage : http://www.rubor.de/bioinf 
 13  #    email    : krother@genesilico.pl 
 14  # 
 15  # 
 16  # This code is released under the conditions of the Biopython license. 
 17  # It may be distributed freely with respect to the original author. 
 18  # Any maintainer of the Biopython code may change this notice 
 19  # when appropriate. 
 20   
 21  """ Access the PDB over the internet (e.g. to download structures). """ 
 22   
 23  from __future__ import print_function 
 24   
 25  import contextlib 
 26  import gzip 
 27  import os 
 28  import shutil 
 29   
 30  # Importing these functions with leading underscore as not intended for reuse 
 31  from Bio._py3k import urlopen as _urlopen 
 32  from Bio._py3k import urlretrieve as _urlretrieve 
 33   
 34  __docformat__ = "restructuredtext en" 
 35   
class PDBList(object):
    """Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    The class also provides a function to retrieve PDB files from the
    server. To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    If you want to use this module from inside a proxy, add
    the proxy variable to your environment, e.g. in Unix:
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)

    All PDB codes returned by the ``get_*`` methods are native ``str``
    objects on both Python 2 and Python 3.
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    alternative_download_url = "http://www.rcsb.org/pdb/files/"
    # just append PDB code to this, and then it works.

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        :param server: base URL of the remote PDB server.
        :param pdb: root of the local PDB file tree. Defaults to the
            current working directory *at call time* (not at import time).
        :param obsolete_pdb: root of the local tree for obsolete entries.
            Defaults to ``<pdb>/obsolete``, which is created if missing.
        """
        self.pdb_server = server  # remote pdb server
        # local pdb file tree
        self.local_pdb = os.getcwd() if pdb is None else pdb

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    # ------------------------------------------------------------------
    # Private helpers (pure parsing / file handling, no network access)
    # ------------------------------------------------------------------

    @staticmethod
    def _to_str(value):
        """Return *value* as a native ``str`` (decodes bytes on Python 3)."""
        if isinstance(value, str):
            return value
        return value.decode("utf-8", "replace")

    @classmethod
    def _parse_status_codes(cls, lines):
        """Return the PDB codes from a status file (one code per line)."""
        codes = []
        for line in lines:
            pdb = cls._to_str(line).strip()
            if not pdb:
                # tolerate blank (e.g. trailing) lines
                continue
            assert len(pdb) == 4
            codes.append(pdb)
        return codes

    @classmethod
    def _latest_status_dir(cls, listing):
        """Return the largest all-digit name in a directory listing.

        The name is taken from the last whitespace-separated field of each
        listing line. Raises IndexError if no such name is present.
        """
        dated = []
        for entry in listing:
            fields = cls._to_str(entry).split()
            if fields and fields[-1].isdigit():
                dated.append(fields[-1])
        return sorted(dated, key=int)[-1]

    @classmethod
    def _parse_index_codes(cls, lines):
        """Return the PDB codes of an ``entries.idx`` style index file.

        The first two lines are skipped as header lines; the code is the
        first four characters of every remaining line longer than that.
        """
        return [cls._to_str(line[:4]) for line in list(lines)[2:]
                if len(line) > 4]

    @classmethod
    def _parse_obsolete_codes(cls, lines):
        """Return the obsoleted PDB codes from ``obsolete.dat`` lines."""
        # Not a list comprehension because of the assert, which checks
        # for mis-reading the data.
        codes = []
        for line in lines:
            line = cls._to_str(line)
            if not line.startswith("OBSLTE "):
                continue
            pdb = line.split()[2]
            assert len(pdb) == 4
            codes.append(pdb)
        return codes

    @staticmethod
    def _gunzip(archive, target):
        """Decompress gzip file *archive* into *target*.

        If decompression fails, the partially written *target* is removed
        so that a later run does not mistake it for a complete structure.
        """
        # contextlib.closing because gzip.open is not a context manager
        # before Python 2.7.
        with contextlib.closing(gzip.open(archive, 'rb')) as gz:
            try:
                with open(target, 'wb') as out:
                    shutil.copyfileobj(gz, out)
            except Exception:
                if os.path.exists(target):
                    os.remove(target)
                raise

    @staticmethod
    def _write_code_list(listfile, entries):
        """Write *entries* to *listfile*, one PDB code per line."""
        with open(listfile, 'w') as outfile:
            outfile.writelines(x + '\n' for x in entries)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get_status_list(self, url):
        """Retrieve the list of pdb codes in a weekly pdb status file.

        Used by get_recent_changes. The list files parsed by this method
        contain very simply one PDB name per line.

        :param url: URL of the status file.
        :returns: list of four-letter PDB codes (``str``).
        """
        with contextlib.closing(_urlopen(url)) as handle:
            return self._parse_status_codes(handle)

    def get_recent_changes(self):
        """Return three lists of the newest weekly files (added,mod,obsolete).

        Reads the directory listing of changed entries from the PDB server
        and returns a list ``[added, modified, obsolete]`` of three lists of
        PDB codes taken from the most recent status directory. The
        directory with the largest numerical name is used.

        Raises IndexError if the listing contains no all-digit name.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README
        """
        url = self.pdb_server + '/pub/pdb/data/status/'
        with contextlib.closing(_urlopen(url)) as handle:
            recent = self._latest_status_dir(handle)

        path = self.pdb_server + '/pub/pdb/data/status/%s/' % (recent)

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieve a big file containing all the PDB entries.

        The index file also holds some annotation for each entry; only the
        codes are used.

        :returns: list of the PDB codes (``str``) in the index file.
        """
        print("retrieving index file. Takes about 5 MB.")
        url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
        with contextlib.closing(_urlopen(url)) as handle:
            return self._parse_index_codes(handle)

    def get_all_obsolete(self):
        """Return a list of all obsolete entries ever in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this::

             LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
            OBSLTE    31-JUL-94 116L     216L
            ...
            OBSLTE    29-JAN-96 1HFT     2HFT
            OBSLTE    21-SEP-06 1HFV     2J5X
            OBSLTE    21-NOV-03 1HG6
            OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
            OBSLTE    08-NOV-96 1HID     2HID
            OBSLTE    01-APR-97 1HIU     2HIU
            OBSLTE    14-JAN-04 1HKE     1UUZ
            ...

        :returns: list of obsolete PDB codes (``str``).
        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        with contextlib.closing(_urlopen(url)) as handle:
            return self._parse_obsolete_codes(handle)

    def retrieve_pdb_file(self, pdb_code, obsolete=False, pdir=None):
        """Retrieve a PDB structure file and store it in a local file tree.

        The compressed entry is downloaded from the PDB server,
        decompressed next to it, and the archive is deleted afterwards.
        Unless ``self.overwrite`` is set, an already existing local file is
        returned without downloading anything.

        :param pdb_code: four-letter PDB code (case-insensitive).
        :param obsolete: if true, fetch from the server's obsolete tree and
            save below ``self.obsolete_pdb``.
        :param pdir: put the file in this directory (default: create a
            PDB-style directory tree, unless ``self.flat_tree`` is set).
        :returns: file name of the decompressed structure, as a string.
        """
        # Location of the compressed PDB structure on the server
        code = pdb_code.lower()
        archive_fn = "pdb%s.ent.gz" % code
        pdb_dir = "obsolete" if obsolete else "divided"
        url = (self.pdb_server +
               '/pub/pdb/data/structures/%s/pdb/%s/%s' %
               (pdb_dir, code[1:3], archive_fn))

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.obsolete_pdb if obsolete else self.local_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        if not os.access(path, os.F_OK):
            os.makedirs(path)

        filename = os.path.join(path, archive_fn)
        final_file = os.path.join(path, "pdb%s.ent" % code)  # (decompressed)

        # Skip download if the file already exists
        if not self.overwrite and os.path.exists(final_file):
            print("Structure exists: '%s' " % final_file)
            return final_file

        # Retrieve the file
        print("Downloading PDB structure '%s'..." % pdb_code)
        _urlretrieve(url, filename)

        # Uncompress the archive, delete when done
        self._gunzip(filename, final_file)
        os.remove(filename)

        return final_file

    def update_pdb(self):
        """Download this week's new/modified entries and file obsolete ones.

        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files. Local copies of
        entries that became obsolete are moved to the obsolete tree.
        You can call this module as a weekly cronjob.

        Both ``self.local_pdb`` and ``self.obsolete_pdb`` must already be
        existing directories (checked with ``assert``).
        """
        assert os.path.isdir(self.local_pdb)
        assert os.path.isdir(self.obsolete_pdb)

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                # Best effort: one failed download must not stop the update.
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # Move the obsolete files to a special folder
        for pdb_code in obsolete:
            # retrieve_pdb_file stores entries under the lower-cased code
            code = pdb_code.lower()
            entry_fn = 'pdb%s.ent' % code
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, entry_fn)
                new_dir = self.obsolete_pdb
            else:
                old_file = os.path.join(self.local_pdb, code[1:3], entry_fn)
                new_dir = os.path.join(self.obsolete_pdb, code[1:3])
            new_file = os.path.join(new_dir, entry_fn)
            if os.path.isfile(old_file):
                if not os.path.isdir(new_dir):
                    os.mkdir(new_dir)
                try:
                    shutil.move(old_file, new_file)
                except Exception:
                    print("Could not move %s to obsolete folder" % old_file)
            elif os.path.isfile(new_file):
                print("Obsolete file %s already moved" % old_file)
            else:
                print("Obsolete file %s is missing" % old_file)

    def download_entire_pdb(self, listfile=None):
        """Retrieve all PDB entries not present in the local PDB copy.

        :param listfile: optional path; if given, all PDB codes are
            written to this file, one per line.
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)
        # Write the list
        if listfile:
            self._write_code_list(listfile, entries)

    def download_obsolete_entries(self, listfile=None):
        """Retrieve all obsolete PDB entries not present locally.

        Entries are stored in the local obsolete PDB copy.

        :param listfile: optional path; if given, all obsolete PDB codes
            are written to this file, one per line.
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=True)

        # Write the list
        if listfile:
            self._write_code_list(listfile, entries)

    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieve a (big) file containing all the sequences of PDB entries.

        :param savefile: local path the sequence file is written to.
        """
        print("Retrieving sequence file (takes about 15 MB).")
        url = self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt'
        _urlretrieve(url, savefile)


if __name__ == '__main__':
    # Command-line front end: PDBList.py <command|PDB-ID> <pdb_path> [options]

    import sys

    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    if len(sys.argv) > 2:
        # Explicit local tree given; honour the option flags that follow it.
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1

    else:
        # No path given: work in the current directory, without a tree.
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        if sys.argv[1] == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif sys.argv[1] == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif sys.argv[1] == 'obsol':
            # get all obsolete entries
            # The only argument of download_obsolete_entries is an optional
            # *list file*; pdb_path is a directory and was already handed to
            # the PDBList constructor, so it must not be passed here.
            pl.download_obsolete_entries()

        elif len(sys.argv[1]) == 4 and sys.argv[1][0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(sys.argv[1], pdir=pdb_path)