Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2  # 
  3  # PDBList.py 
  4  # 
  5  # A tool for tracking changes in the PDB Protein Structure Database. 
  6  # 
  7  # (c) 2003 Kristian Rother 
  8  # This work was supported by the German Ministry of Education 
  9  # and Research (BMBF). Project http://www.bcbio.de 
 10  # 
 11  # Contact the author 
 12  #    homepage : http://www.rubor.de/bioinf 
 13  #    email    : krother@genesilico.pl 
 14  # 
 15  # 
 16  # (c) 2016 Wiktoria Karwicka & Jacek Smietanski 
 17  #   - updated and Python 3.x compatible code 
 18  #   - new options to enable download PDBx/mmCif, PDBML and mmtf formatted 
 19  #       files as well as large PDB bundles 
 20  #   - unit tests for the module 
 21  # 
 22  # Contact the corresponding author 
 23  #   homepage : http://jaceksmietanski.net 
 24  #   email    : jacek.smietanski@ii.uj.edu.pl 
 25  # 
 26  # This code is released under the conditions of the Biopython license. 
 27  # Please see the LICENSE file that should have been included as part of this 
 28  # package. 
 29  # It may be distributed freely with respect to the original authors. 
 30  # Any maintainer of the Biopython code may change this notice 
 31  # when appropriate. 
 32   
 33  """Access the PDB over the internet (e.g. to download structures).""" 
 34   
 35  from __future__ import print_function 
 36   
 37  import contextlib 
 38  import gzip 
 39  import os 
 40  import shutil 
 41  import re 
 42  import sys 
 43   
 44  # Importing these functions with leading underscore as not intended for reuse 
 45  from Bio._py3k import _as_string 
 46  from Bio._py3k import urlopen as _urlopen 
 47  from Bio._py3k import urlretrieve as _urlretrieve 
 48  from Bio._py3k import urlcleanup as _urlcleanup 
class PDBList(object):
    """Quick access to the structure lists on the PDB or its mirrors.

    This class provides quick access to the structure lists on the
    PDB server or its mirrors. The structure lists contain
    four-letter PDB codes, indicating that structures are
    new, have been modified or are obsolete. The lists are released
    on a weekly basis.

    It also provides a function to retrieve PDB files from the server.
    To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    All available file formats (PDB, PDBx/mmCif, PDBML, mmtf) are supported.
    Please note that large structures (containing >62 chains
    and/or 99999 ATOM lines) are no longer stored as a single PDB file
    and by default (when PDB format selected) are not downloaded.

    Large structures can be downloaded in other formats, including PDBx/mmCif
    or as a .tar file (a collection of PDB-like formatted files for a given
    structure).

    If you want to use this module from inside a proxy, add
    the proxy variable to your environment, e.g. in Unix:
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the remote PDB server
        @type server: string

        @param pdb: root of the local PDB file tree. Defaults to the current
            working directory at the time the instance is created.
        @type pdb: string

        @param obsolete_pdb: root of the local tree for obsolete entries.
            Defaults to an 'obsolete' folder inside the local PDB tree.
            The folder is created if it does not exist yet.
        @type obsolete_pdb: string
        """
        self.pdb_server = server  # remote pdb server

        # Resolve the working directory per instance; a default of
        # os.getcwd() in the signature would be frozen at import time.
        self.local_pdb = pdb if pdb is not None else os.getcwd()  # local pdb file tree

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variable for command-line option
        self.flat_tree = False

    @staticmethod
    def _print_default_format_warning(file_format):
        """Warn that the default format is now mmCif and resolve the format.

        Temporary warning (similar to a deprecation warning): when no format
        was given, a notice is written to stderr and "mmCif" is returned;
        any other value is returned unchanged.
        """
        if file_format is None:
            sys.stderr.write("WARNING: The default download format has changed from PDB to PDBx/mmCif\n")
            return "mmCif"
        return file_format

    @staticmethod
    def get_status_list(url):
        """Retrieve a list of pdb codes in the weekly pdb status file.

        Reads the status file from the given URL. Used by get_recent_changes.

        Typical contents of the list files parsed by this method is now
        very simply one PDB name per line.

        @param url: location of the status file
        @type url: string

        @return: the PDB codes found in the file, one per line
        @rtype: list of strings
        """
        with contextlib.closing(_urlopen(url)) as handle:
            answer = []
            for line in handle:
                pdb = line.strip()
                # Sanity check against mis-reading the data
                assert len(pdb) == 4
                answer.append(_as_string(pdb))
        return answer

    def get_recent_changes(self):
        """Return three lists of PDB codes from the newest weekly status files.

        Reads the 'added.pdb', 'modified.pdb' and 'obsolete.pdb' files from
        the data/status/latest directory of the PDB server.

        @return: [added, modified, obsolete], each a list of PDB codes
        @rtype: list of three lists of strings
        """
        path = self.pdb_server + '/pub/pdb/data/status/latest/'

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieve a big file containing all the PDB entries and some annotation.

        Returns a list of PDB codes in the index file.
        """
        url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
        print("Retrieving index file. Takes about 27 MB.")
        with contextlib.closing(_urlopen(url)) as handle:
            # The first two lines are skipped; the code is the first
            # four characters of every remaining line.
            all_entries = [_as_string(line[:4]) for line in handle.readlines()[2:]
                           if len(line) > 4]
        return all_entries

    def get_all_obsolete(self):
        """Return a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes that have ever been
        in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this::

             LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
            OBSLTE    31-JUL-94 116L     216L
            ...
            OBSLTE    29-JAN-96 1HFT     2HFT
            OBSLTE    21-SEP-06 1HFV     2J5X
            OBSLTE    21-NOV-03 1HG6
            OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
            OBSLTE    08-NOV-96 1HID     2HID
            OBSLTE    01-APR-97 1HIU     2HIU
            OBSLTE    14-JAN-04 1HKE     1UUZ
            ...

        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        with contextlib.closing(_urlopen(url)) as handle:
            # Extract pdb codes. Could use a list comprehension, but I want
            # to include an assert to check for mis-reading the data.
            obsolete = []
            for line in handle:
                if not line.startswith(b"OBSLTE "):
                    continue
                pdb = line.split()[2]
                assert len(pdb) == 4
                obsolete.append(_as_string(pdb))
        return obsolete

    def retrieve_pdb_file(self, pdb_code, obsolete=False, pdir=None, file_format=None, overwrite=False):
        """Fetch PDB structure file from PDB server, and store it locally.

        The PDB structure's file name is returned as a single string.
        If obsolete ``==`` True, the file will be saved in a special file tree.

        NOTE. The default download format has changed from PDB to PDBx/mmCif

        @param pdb_code: 4-symbols structure Id from PDB (e.g. 3J92).
        @type pdb_code: string

        @param file_format:
            File format. Available options:

            * "mmCif" (default, PDBx/mmCif file),
            * "pdb" (format PDB),
            * "xml" (PDBML/XML format),
            * "mmtf" (highly compressed),
            * "bundle" (PDB formatted archive for large structure)

        @type file_format: string

        @param overwrite: if set to True, existing structure files will be overwritten. Default: False
        @type overwrite: bool

        @param obsolete:
            Has a meaning only for obsolete structures. If True, download the obsolete structure
            to 'obsolete' folder, otherwise download won't be performed.
            This option doesn't work for mmtf format as obsoleted structures aren't stored in mmtf.
            Also doesn't have meaning when parameter pdir is specified.
            Note: make sure that you are about to download the really obsolete structure.
            Trying to download non-obsolete structure into obsolete folder will not work
            and you face the "structure doesn't exists" error.
            Default: False

        @type obsolete: bool

        @param pdir: put the file in this directory (default: create a PDB-style directory tree)
        @type pdir: string

        @return: filename of the (uncompressed) local structure file. The
            name is returned even when the download failed; in that case a
            message is printed and the file is not created.
        @rtype: string

        @raise ValueError: if file_format is not one of the supported formats
        """
        file_format = self._print_default_format_warning(file_format)  # Deprecation warning

        archive = {'pdb': 'pdb%s.ent.gz', 'mmCif': '%s.cif.gz', 'xml': '%s.xml.gz', 'mmtf': '%s',
                   'bundle': '%s-pdb-bundle.tar.gz'}

        # Validate before the dictionary lookup below, so that an unknown
        # format produces a descriptive error rather than a bare KeyError.
        if file_format not in archive:
            raise ValueError("Specified file_format %s doesn't exists or is not supported. Maybe a typo. "
                             "Please, use one of the following: mmCif, pdb, xml, mmtf, bundle" % file_format)

        # Get the compressed PDB structure
        code = pdb_code.lower()
        archive_fn = archive[file_format] % code

        if file_format in ('pdb', 'mmCif', 'xml'):
            pdb_dir = "divided" if not obsolete else "obsolete"
            file_type = "pdb" if file_format == "pdb" else "mmCIF" if file_format == "mmCif" else "XML"
            url = (self.pdb_server + '/pub/pdb/data/structures/%s/%s/%s/%s' % (pdb_dir, file_type, code[1:3], archive_fn))
        elif file_format == 'bundle':
            url = (self.pdb_server + '/pub/pdb/compatible/pdb_bundle/%s/%s/%s' % (code[1:3], code, archive_fn))
        else:
            # mmtf files are served from a separate, fixed host
            url = ('http://mmtf.rcsb.org/v1.0/full/%s' % code)

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.local_pdb if not obsolete else self.obsolete_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        if not os.access(path, os.F_OK):
            os.makedirs(path)
        filename = os.path.join(path, archive_fn)
        final = {'pdb': 'pdb%s.ent', 'mmCif': '%s.cif', 'xml': '%s.xml',
                 'mmtf': '%s.mmtf', 'bundle': '%s-pdb-bundle.tar'}
        final_file = os.path.join(path, final[file_format] % code)

        # Skip download if the file already exists
        if not overwrite:
            if os.path.exists(final_file):
                print("Structure exists: '%s' " % final_file)
                return final_file

        # Retrieve the file
        print("Downloading PDB structure '%s'..." % pdb_code)
        try:
            _urlcleanup()
            _urlretrieve(url, filename)
        except IOError:
            # Best effort: report and carry on so that bulk downloads
            # are not aborted by a single missing entry.
            print("Desired structure doesn't exists")
        else:
            # Decompress the downloaded archive, then drop the compressed copy
            with gzip.open(filename, 'rb') as gz:
                with open(final_file, 'wb') as out:
                    shutil.copyfileobj(gz, out)
            os.remove(filename)
        return final_file

    def update_pdb(self, file_format=None):
        """Update your local copy of the PDB files.

        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files.
        You can call this module as a weekly cron job.

        Entries listed as obsolete are moved from the local tree to the
        obsolete tree. NOTE: only files named 'pdb<code>.ent' (PDB format)
        are looked for when moving obsolete entries.

        @param file_format: format of the files to download, see
            retrieve_pdb_file for the available options
        @type file_format: string
        """
        assert os.path.isdir(self.local_pdb)
        assert os.path.isdir(self.obsolete_pdb)

        file_format = self._print_default_format_warning(file_format)  # Deprecation warning

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code, file_format=file_format)
            except Exception:
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # Move the obsolete files to a special folder
        for pdb_code in obsolete:
            # Local files are always stored under the lower-case code
            # (see retrieve_pdb_file), so look them up the same way.
            code = pdb_code.lower()
            entry_name = 'pdb%s.ent' % code
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, entry_name)
                new_dir = self.obsolete_pdb
            else:
                old_file = os.path.join(self.local_pdb, code[1:3], entry_name)
                new_dir = os.path.join(self.obsolete_pdb, code[1:3])
            new_file = os.path.join(new_dir, entry_name)
            if os.path.isfile(old_file):
                if not os.path.isdir(new_dir):
                    os.makedirs(new_dir)
                try:
                    shutil.move(old_file, new_file)
                except Exception:
                    print("Could not move %s to obsolete folder" % old_file)
            elif os.path.isfile(new_file):
                print("Obsolete file %s already moved" % old_file)
            else:
                print("Obsolete file %s is missing" % old_file)

    def download_pdb_files(self, pdb_codes, obsolete=False, pdir=None, file_format=None, overwrite=False):
        """Fetch set of PDB structure files from the PDB server and store them locally.

        The local file names of the structures are returned as a list.
        If obsolete ``==`` True, the files will be saved in a special file tree.

        @param pdb_codes: a list of 4-symbols structure Ids from PDB
        @type pdb_codes: list of strings

        @param file_format:
            File format. Available options:

            * "mmCif" (default, PDBx/mmCif file),
            * "pdb" (format PDB),
            * "xml" (PDBML/XML format),
            * "mmtf" (highly compressed),
            * "bundle" (PDB formatted archive for large structure)

        @param overwrite: if set to True, existing structure files will be overwritten. Default: False
        @type overwrite: bool

        @param obsolete:
            Has a meaning only for obsolete structures.
            If True, download the obsolete structure
            to 'obsolete' folder, otherwise download won't be performed.
            This option doesn't work for mmtf format as obsoleted structures are not available as mmtf.
            (default: False)

        @type obsolete: bool

        @param pdir: put the file in this directory (default: create a PDB-style directory tree)
        @type pdir: string

        @return: filenames, in the same order as pdb_codes
        @rtype: list of strings
        """
        # Resolve the default once so the warning is emitted a single time
        file_format = self._print_default_format_warning(file_format)  # Deprecation warning
        return [self.retrieve_pdb_file(pdb_code, obsolete=obsolete, pdir=pdir,
                                       file_format=file_format, overwrite=overwrite)
                for pdb_code in pdb_codes]

    def download_entire_pdb(self, listfile=None, file_format=None):
        """Retrieve all PDB entries not present in the local PDB copy.

        @param listfile: filename to which all PDB codes will be written (optional)

        @param file_format:
            File format. Available options:

            * "mmCif" (default, PDBx/mmCif file),
            * "pdb" (format PDB),
            * "xml" (PDBML/XML format),
            * "mmtf" (highly compressed),
            * "bundle" (PDB formatted archive for large structure)

        NOTE. The default download format has changed from PDB to PDBx/mmCif
        """
        file_format = self._print_default_format_warning(file_format)  # Deprecation warning
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, file_format=file_format)
        # Write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines((x + '\n' for x in entries))

    def download_obsolete_entries(self, listfile=None, file_format=None):
        """Retrieve all obsolete PDB entries not present in the local obsolete PDB copy.

        @param listfile: filename to which all PDB codes will be written (optional)

        @param file_format: file format. Available options:
            "mmCif" (default, PDBx/mmCif file),
            "pdb" (format PDB),
            "xml" (PDBML/XML format),

        NOTE. The default download format has changed from PDB to PDBx/mmCif
        """
        file_format = self._print_default_format_warning(file_format)  # Deprecation warning
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=True, file_format=file_format)

        # Write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines((x + '\n' for x in entries))

    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieve and save a (big) file containing all the sequences of PDB entries.

        @param savefile: local file name the sequence file is saved to
        @type savefile: string
        """
        print("Retrieving sequence file (takes over 110 MB).")
        url = self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt'
        _urlretrieve(url, savefile)
if __name__ == '__main__':
    # Command-line front end: parse <command> <pdb_path> [options] from
    # sys.argv and dispatch to the matching PDBList method.

    doc = """PDBList.py
    (c) Kristian Rother 2003, Wiktoria Karwicka & Jacek Smietanski 2016
    Contributed to Biopython

    Usage::

        PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                                   local pdb tree.
        PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                                   local pdb tree.
        PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                                   entries to local pdb tree.
        PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure
        PDBList.py (<PDB-ID1>,<PDB-ID2>,...) <pdb_path> [options] - retrieve a set
                                                   of structures

    Options:
     -d       A single directory will be used as <pdb_path>, not a tree.
     -o       Overwrite existing structure files.
     -pdb     Downloads structures in PDB format
     -xml     Downloads structures in PDBML (XML) format
     -mmtf    Downloads structures in mmtf format

    Maximum one format can be specified simultaneously (if more selected, only
    the last will be considered). By default (no format specified) structures are
    downloaded as PDBx/mmCif files.
    """
    print(doc)

    file_format = "mmCif"
    overwrite = False

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        # Everything after <pdb_path> is an option; unknown ones are ignored
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = True
            elif option == '-o':
                overwrite = True
            elif option in ('-pdb', '-xml', '-mmtf'):
                file_format = option[1:]
    else:
        # No path given: work directly in the current directory
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = True

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb(file_format=file_format)

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb(file_format=file_format)

        elif command == 'obsol':
            # get all obsolete entries; the first positional parameter of
            # download_obsolete_entries is a list *file*, so the PDB
            # directory must not be passed there.
            pl.download_obsolete_entries(file_format=file_format)

        elif len(command) == 4 and command[0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(command, pdir=pdb_path, file_format=file_format, overwrite=overwrite)

        elif command.startswith('('):
            # get a set of PDB entries; re.findall takes (pattern, string)
            pdb_ids = re.findall(r"[0-9A-Za-z]{4}", command)
            for pdb_id in pdb_ids:
                pl.retrieve_pdb_file(pdb_id, pdir=pdb_path, file_format=file_format, overwrite=overwrite)