Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2  # 
  3  # PDBList.py 
  4  # 
  5  # A tool for tracking changes in the PDB Protein Structure Database. 
  6  # 
  7  # (c) 2003 Kristian Rother 
  8  # This work was supported by the German Ministry of Education 
  9  # and Research (BMBF). Project http://www.bcbio.de 
 10  # 
 11  # Contact the author 
 12  #    homepage : http://www.rubor.de/bioinf 
 13  #    email    : krother@genesilico.pl 
 14  # 
 15  # 
 16  # (c) 2016 Wiktoria Karwicka & Jacek Smietanski 
 17  #   - updated and Python 3.x compatible code 
 18  #   - new options to enable download PDBx/mmCif, PDBML and mmtf formatted 
 19  #       files as well as large PDB bundles 
 20  #   - unit tests for the module 
 21  # 
 22  # Contact the corresponding author 
 23  #   homepage : http://jaceksmietanski.net 
 24  #   email    : jacek.smietanski@ii.uj.edu.pl 
 25  # 
 26  # This code is released under the conditions of the Biopython license. 
 27  # Please see the LICENSE file that should have been included as part of this 
 28  # package. 
 29  # It may be distributed freely with respect to the original authors. 
 30  # Any maintainer of the Biopython code may change this notice 
 31  # when appropriate. 
 32   
 33  """ Access the PDB over the internet (e.g. to download structures). """ 
 34   
 35  from __future__ import print_function 
 36   
 37  import contextlib 
 38  import gzip 
 39  import os 
 40  import shutil 
 41  import re 
 42  import sys 
 43   
 44  # Importing these functions with leading underscore as not intended for reuse 
 45  from Bio._py3k import _as_string 
 46  from Bio._py3k import urlopen as _urlopen 
 47  from Bio._py3k import urlretrieve as _urlretrieve 
 48  from Bio._py3k import urlcleanup as _urlcleanup 
class PDBList(object):
    """Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    It also provides a function to retrieve PDB files from the server.
    To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    All available file formats (PDB, PDBx/mmCif, PDBML, mmtf) are supported.
    Please note that large structures (containing >62 chains
    and/or 99999 ATOM lines) are no longer stored as a single PDB file
    and by default (when PDB format selected) are not downloaded.

    Large structures can be downloaded in other formats, including PDBx/mmCif
    or as a .tar file (a collection of PDB-like formatted files for a given
    structure).

    If you want to use this module from inside a proxy, add
    the proxy variable to your environment, e.g. in Unix:
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    # File name templates (filled with the lower-case PDB code), keyed by the
    # public ``file_format`` option. Shared by retrieve_pdb_file and update_pdb
    # so that both agree on how a structure is named on disk.
    # Name of the file as it is fetched from the server:
    _ARCHIVE_NAME = {'pdb': 'pdb%s.ent.gz', 'mmCif': '%s.cif.gz', 'xml': '%s.xml.gz', 'mmtf': '%s',
                     'bundle': '%s-pdb-bundle.tar.gz'}
    # Name of the decompressed file that is kept locally:
    _FINAL_NAME = {'pdb': 'pdb%s.ent', 'mmCif': '%s.cif', 'xml': '%s.xml',
                   'mmtf': '%s.mmtf', 'bundle': '%s-pdb-bundle.tar'}

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the remote PDB server
        @type server: string

        @param pdb: root of the local PDB file tree
            (default: the current working directory at the time of the call)
        @type pdb: string

        @param obsolete_pdb: local directory for obsolete entries
            (default: an 'obsolete' sub-directory of the local tree, which is
            created if it does not exist yet)
        @type obsolete_pdb: string
        """
        self.pdb_server = server  # remote pdb server

        # local pdb file tree; resolved per call rather than in the signature,
        # where os.getcwd() would be evaluated only once at definition time
        self.local_pdb = os.getcwd() if pdb is None else pdb

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.access(self.obsolete_pdb, os.F_OK):
                os.makedirs(self.obsolete_pdb)

        # variable for command-line option
        self.flat_tree = False

    @staticmethod
    def _print_default_format_warning(file_format):
        """Resolve the default file format, warning that it is now mmCif.

        Temporary warning (similar to a deprecation warning): when no format
        was requested, a notice is written to stderr and "mmCif" is returned;
        otherwise the given format is returned unchanged.
        """
        if file_format is None:
            sys.stderr.write("WARNING: The default download format has changed from PDB to PDBx/mmCif\n")
            return "mmCif"
        return file_format

    @staticmethod
    def get_status_list(url):
        """Retrieve a list of pdb codes in the weekly pdb status file.

        Reads the file at the given URL. Used by get_recent_changes.

        Typical contents of the list files parsed by this method is now
        very simply one PDB name per line.

        @param url: location of the status file
        @type url: string

        @return: the PDB codes found, one per line of the file
        @rtype: list of strings
        """
        with contextlib.closing(_urlopen(url)) as handle:
            answer = []
            for line in handle:
                pdb = line.strip()
                # Sanity check against mis-reading the data
                assert len(pdb) == 4
                answer.append(_as_string(pdb))
        return answer

    def get_recent_changes(self):
        """Return three lists of PDB codes from the newest weekly status files.

        Reads the 'added.pdb', 'modified.pdb' and 'obsolete.pdb' files from
        the data/status/latest directory of the PDB server.

        @return: [added, modified, obsolete], each a list of PDB codes
        @rtype: list of three lists of strings
        """
        path = self.pdb_server + '/pub/pdb/data/status/latest/'

        # Retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieve a big file containing all the PDB entries and some annotation.

        @return: the PDB codes in the index file
        @rtype: list of strings
        """
        url = self.pdb_server + '/pub/pdb/derived_data/index/entries.idx'
        print("Retrieving index file. Takes about 27 MB.")
        with contextlib.closing(_urlopen(url)) as handle:
            # The first two lines are skipped; the code is the first four
            # characters of every remaining line that is long enough.
            all_entries = [_as_string(line[:4]) for line in handle.readlines()[2:]
                           if len(line) > 4]
        return all_entries

    def get_all_obsolete(self):
        """Return a list of all obsolete entries ever in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this::

             LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
            OBSLTE    31-JUL-94 116L     216L
            ...
            OBSLTE    29-JAN-96 1HFT     2HFT
            OBSLTE    21-SEP-06 1HFV     2J5X
            OBSLTE    21-NOV-03 1HG6
            OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
            OBSLTE    08-NOV-96 1HID     2HID
            OBSLTE    01-APR-97 1HIU     2HIU
            OBSLTE    14-JAN-04 1HKE     1UUZ
            ...

        @return: the obsolete PDB codes
        @rtype: list of strings
        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        with contextlib.closing(_urlopen(url)) as handle:
            # Extract pdb codes. Could use a list comprehension, but I want
            # to include an assert to check for mis-reading the data.
            obsolete = []
            for line in handle:
                if not line.startswith(b"OBSLTE "):
                    continue
                pdb = line.split()[2]
                assert len(pdb) == 4
                obsolete.append(_as_string(pdb))
        return obsolete

    def retrieve_pdb_file(self, pdb_code, obsolete=False, pdir=None, file_format=None, overwrite=False):
        """Fetch PDB structure file from PDB server, and store it locally.

        The PDB structure's file name is returned as a single string.
        If obsolete ``==`` True, the file will be saved in a special file tree.

        NOTE. The default download format has changed from PDB to PDBx/mmCif

        @param pdb_code: 4-symbols structure Id from PDB (e.g. 3J92).
        @type pdb_code: string

        @param file_format:
            File format. Available options:

            * "mmCif" (default, PDBx/mmCif file),
            * "pdb" (format PDB),
            * "xml" (PDBML/XML format),
            * "mmtf" (highly compressed),
            * "bundle" (PDB formatted archive for large structure)

        @type file_format: string

        @param overwrite: if set to True, existing structure files will be overwritten. Default: False
        @type overwrite: bool

        @param obsolete:
            Has a meaning only for obsolete structures. If True, download the obsolete structure
            to 'obsolete' folder, otherwise download won't be performed.
            This option doesn't work for mmtf format as obsoleted structures aren't stored in mmtf.
            Also doesn't have meaning when parameter pdir is specified.
            Note: make sure that you are about to download the really obsolete structure.
            Trying to download non-obsolete structure into obsolete folder will not work
            and you face the "structure doesn't exists" error.
            Default: False

        @type obsolete: bool

        @param pdir: put the file in this directory (default: create a PDB-style directory tree)
        @type pdir: string

        @return: filename of the local structure file. The name is returned even
            when the download failed (a message is printed in that case), so
            callers that need the file should check that it exists.
        @rtype: string

        @raise KeyError: if file_format is not one of the supported formats
        """
        file_format = self._print_default_format_warning(file_format)  # Deprecation warning

        # Validate before the format is used as a dictionary key, so that the
        # caller gets the explanatory message instead of a bare key error.
        if file_format not in self._ARCHIVE_NAME:
            raise KeyError("Specified file_format %s doesn't exists or is not supported. Maybe a typo. "
                           "Please, use one of the following: mmCif, pdb, xml, mmtf, bundle" % file_format)

        # Get the compressed PDB structure
        code = pdb_code.lower()
        archive_fn = self._ARCHIVE_NAME[file_format] % code

        if file_format in ('pdb', 'mmCif', 'xml'):
            pdb_dir = "divided" if not obsolete else "obsolete"
            file_type = "pdb" if file_format == "pdb" else "mmCIF" if file_format == "mmCif" else "XML"
            url = (self.pdb_server + '/pub/pdb/data/structures/%s/%s/%s/%s' % (pdb_dir, file_type, code[1:3], archive_fn))
        elif file_format == 'bundle':
            url = (self.pdb_server + '/pub/pdb/compatible/pdb_bundle/%s/%s/%s' % (code[1:3], code, archive_fn))
        else:
            # mmtf files are served from a separate, fixed host
            url = ('http://mmtf.rcsb.org/v1.0/full/%s' % code)

        # Where does the final PDB file get saved?
        if pdir is None:
            path = self.local_pdb if not obsolete else self.obsolete_pdb
            if not self.flat_tree:  # Put in PDB-style directory tree
                path = os.path.join(path, code[1:3])
        else:  # Put in specified directory
            path = pdir
        if not os.access(path, os.F_OK):
            os.makedirs(path)
        filename = os.path.join(path, archive_fn)
        final_file = os.path.join(path, self._FINAL_NAME[file_format] % code)

        # Skip download if the file already exists
        if not overwrite:
            if os.path.exists(final_file):
                print("Structure exists: '%s' " % final_file)
                return final_file

        # Retrieve the file
        print("Downloading PDB structure '%s'..." % pdb_code)
        try:
            _urlcleanup()
            _urlretrieve(url, filename)
        except IOError:
            print("Desired structure doesn't exists")
        else:
            # Decompress next to the download, then drop the compressed copy
            with gzip.open(filename, 'rb') as gz:
                with open(final_file, 'wb') as out:
                    shutil.copyfileobj(gz, out)
            os.remove(filename)
        return final_file

    def update_pdb(self, file_format=None):
        """Download the weekly new/modified entries and set aside obsolete ones.

        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files. Local files of
        entries that became obsolete are moved to the obsolete folder.
        You can call this module as a weekly cron job.

        @param file_format: file format of the local tree, see
            retrieve_pdb_file for the available options (default: "mmCif")
        @type file_format: string
        """
        assert os.path.isdir(self.local_pdb)
        assert os.path.isdir(self.obsolete_pdb)

        file_format = self._print_default_format_warning(file_format)  # Deprecation warning

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code, file_format=file_format)
            except Exception:
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # Obsolete entries are looked up under the name retrieve_pdb_file gave
        # them for this format; unknown formats fall back to the PDB naming.
        name_template = self._FINAL_NAME.get(file_format, 'pdb%s.ent')

        # Move the obsolete files to a special folder
        for pdb_code in obsolete:
            # Local files are always stored under the lower-case code
            code = pdb_code.lower()
            file_name = name_template % code
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, file_name)
                new_dir = self.obsolete_pdb
            else:
                old_file = os.path.join(self.local_pdb, code[1:3], file_name)
                new_dir = os.path.join(self.obsolete_pdb, code[1:3])
            new_file = os.path.join(new_dir, file_name)
            if os.path.isfile(old_file):
                if not os.path.isdir(new_dir):
                    os.makedirs(new_dir)
                try:
                    shutil.move(old_file, new_file)
                except Exception:
                    print("Could not move %s to obsolete folder" % old_file)
            elif os.path.isfile(new_file):
                print("Obsolete file %s already moved" % old_file)
            else:
                print("Obsolete file %s is missing" % old_file)

    def download_pdb_files(self, pdb_codes, obsolete=False, pdir=None, file_format=None, overwrite=False):
        """Fetch set of PDB structure files from the PDB server and store them locally.

        If obsolete ``==`` True, the files will be saved in a special file tree.

        @param pdb_codes: a list of 4-symbols structure Ids from PDB
        @type pdb_codes: list of strings

        @param file_format:
            File format. Available options:

            * "mmCif" (default, PDBx/mmCif file),
            * "pdb" (format PDB),
            * "xml" (PDBML/XML format),
            * "mmtf" (highly compressed),
            * "bundle" (PDB formatted archive for large structure)

        @param overwrite: if set to True, existing structure files will be overwritten. Default: False
        @type overwrite: bool

        @param obsolete:
            Has a meaning only for obsolete structures.
            If True, download the obsolete structure
            to 'obsolete' folder, otherwise download won't be performed.
            This option doesn't work for mmtf format as obsoleted structures are not available as mmtf.
            (default: False)

        @type obsolete: bool

        @param pdir: put the file in this directory (default: create a PDB-style directory tree)
        @type pdir: string

        @return: filenames, one per requested code and in the same order
        @rtype: list of strings
        """
        file_format = self._print_default_format_warning(file_format)  # Deprecation warning
        return [self.retrieve_pdb_file(pdb_code, obsolete=obsolete, pdir=pdir,
                                       file_format=file_format, overwrite=overwrite)
                for pdb_code in pdb_codes]

    def download_entire_pdb(self, listfile=None, file_format=None):
        """Retrieve all PDB entries not present in the local PDB copy.

        @param listfile: filename to which all PDB codes will be written (optional)

        @param file_format:
            File format. Available options:

            * "mmCif" (default, PDBx/mmCif file),
            * "pdb" (format PDB),
            * "xml" (PDBML/XML format),
            * "mmtf" (highly compressed),
            * "bundle" (PDB formatted archive for large structure)

        NOTE. The default download format has changed from PDB to PDBx/mmCif
        """
        file_format = self._print_default_format_warning(file_format)  # Deprecation warning
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, file_format=file_format)
        # Write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines((x + '\n' for x in entries))

    def download_obsolete_entries(self, listfile=None, file_format=None):
        """Retrieve all obsolete PDB entries not present in the local obsolete PDB copy.

        @param listfile: filename to which all PDB codes will be written (optional)

        @param file_format: file format. Available options:
            "mmCif" (default, PDBx/mmCif file),
            "pdb" (format PDB),
            "xml" (PDBML/XML format),

        NOTE. The default download format has changed from PDB to PDBx/mmCif
        """
        file_format = self._print_default_format_warning(file_format)  # Deprecation warning
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=True, file_format=file_format)

        # Write the list
        if listfile:
            with open(listfile, 'w') as outfile:
                outfile.writelines((x + '\n' for x in entries))

    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieve a (big) file containing all the sequences of PDB entries.

        @param savefile: name of the local file the sequences are written to
        @type savefile: string
        """
        print("Retrieving sequence file (takes over 110 MB).")
        url = self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt'
        _urlretrieve(url, savefile)
if __name__ == '__main__':
    # Command-line entry point: the first argument selects the action (or is
    # a PDB id / a parenthesised list of ids), the second is the local path,
    # everything after that is an option flag.

    doc = """PDBList.py
    (c) Kristian Rother 2003, Wiktoria Karwicka & Jacek Smietanski 2016
    Contributed to Biopython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure
    PDBList.py (<PDB-ID1>,<PDB-ID2>,...) <pdb_path> [options] - retrieve a set
                                               of structures

    Options:
     -d       A single directory will be used as <pdb_path>, not a tree.
     -o       Overwrite existing structure files.
     -pdb     Downloads structures in PDB format
     -xml     Downloads structures in PDBML (XML) format
     -mmtf    Downloads structures in mmtf format

    Maximum one format can be specified simultaneously (if more selected, only
    the last will be considered). By default (no format specified) structures are
    downloaded as PDBx/mmCif files.
    """
    print(doc)

    file_format = "mmCif"
    overwrite = False

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        if len(sys.argv) > 3:
            for option in sys.argv[3:]:
                if option == '-d':
                    pl.flat_tree = True
                elif option == '-o':
                    overwrite = True
                elif option in ('-pdb', '-xml', '-mmtf'):
                    # the flag name without its dash is the format key
                    file_format = option[1:]
    else:
        # No path given: work flat in the current directory
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = True

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb(file_format=file_format)

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb(file_format=file_format)

        elif command == 'obsol':
            # get all obsolete entries; the first positional parameter of
            # download_obsolete_entries is a list *file*, so the tree path
            # must not be passed there
            pl.download_obsolete_entries(file_format=file_format)

        elif len(command) == 4 and command[0].isdigit():
            # get single PDB entry
            pl.retrieve_pdb_file(command, pdir=pdb_path, file_format=file_format, overwrite=overwrite)

        elif command.startswith('('):
            # get a set of PDB entries; re.findall takes the pattern first
            pdb_ids = re.findall("[0-9A-Za-z]{4}", command)
            for pdb_id in pdb_ids:
                pl.retrieve_pdb_file(pdb_id, pdir=pdb_path, file_format=file_format, overwrite=overwrite)