Package Bio :: Package PDB :: Module MMCIFParser'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.MMCIFParser'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """mmCIF parsers""" 
  7   
  8  from __future__ import print_function 
  9   
 10  from string import ascii_letters 
 11   
 12  import numpy 
 13  import warnings 
 14   
 15  from Bio.File import as_handle 
 16  from Bio._py3k import range 
 17   
 18  from Bio.PDB.MMCIF2Dict import MMCIF2Dict 
 19  from Bio.PDB.StructureBuilder import StructureBuilder 
 20  from Bio.PDB.PDBExceptions import PDBConstructionException 
 21  from Bio.PDB.PDBExceptions import PDBConstructionWarning 
 22   
 23   
24 -class MMCIFParser(object):
25 """Parse a mmCIF file and return a Structure object.""" 26
27 - def __init__(self, structure_builder=None, QUIET=False):
28 """Create a PDBParser object. 29 30 The mmCIF parser calls a number of standard methods in an aggregated 31 StructureBuilder object. Normally this object is instanciated by the 32 MMCIParser object itself, but if the user provides his/her own 33 StructureBuilder object, the latter is used instead. 34 35 Arguments: 36 - structure_builder - an optional user implemented StructureBuilder class. 37 - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 38 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 39 These warnings might be indicative of problems in the mmCIF file! 40 """ 41 if structure_builder is not None: 42 self._structure_builder = structure_builder 43 else: 44 self._structure_builder = StructureBuilder() 45 # self.header = None 46 # self.trailer = None 47 self.line_counter = 0 48 self.build_structure = None 49 self.QUIET = bool(QUIET)
50 51 # Public methods 52
53 - def get_structure(self, structure_id, filename):
54 """Return the structure. 55 56 Arguments: 57 - structure_id - string, the id that will be used for the structure 58 - filename - name of the mmCIF file OR an open filehandle 59 """ 60 with warnings.catch_warnings(): 61 if self.QUIET: 62 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 63 self._mmcif_dict = MMCIF2Dict(filename) 64 self._build_structure(structure_id) 65 66 return self._structure_builder.get_structure()
67 68 # Private methods 69
70 - def _build_structure(self, structure_id):
71 mmcif_dict = self._mmcif_dict 72 atom_id_list = mmcif_dict["_atom_site.label_atom_id"] 73 residue_id_list = mmcif_dict["_atom_site.label_comp_id"] 74 try: 75 element_list = mmcif_dict["_atom_site.type_symbol"] 76 except KeyError: 77 element_list = None 78 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 79 chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] 80 x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] 81 y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] 82 z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] 83 alt_list = mmcif_dict["_atom_site.label_alt_id"] 84 icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] 85 b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] 86 occupancy_list = mmcif_dict["_atom_site.occupancy"] 87 fieldname_list = mmcif_dict["_atom_site.group_PDB"] 88 try: 89 serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] 90 except KeyError: 91 # No model number column 92 serial_list = None 93 except ValueError: 94 # Invalid model number (malformed file) 95 raise PDBConstructionException("Invalid model number") 96 try: 97 aniso_u11 = mmcif_dict["_atom_site.aniso_U[1][1]"] 98 aniso_u12 = mmcif_dict["_atom_site.aniso_U[1][2]"] 99 aniso_u13 = mmcif_dict["_atom_site.aniso_U[1][3]"] 100 aniso_u22 = mmcif_dict["_atom_site.aniso_U[2][2]"] 101 aniso_u23 = mmcif_dict["_atom_site.aniso_U[2][3]"] 102 aniso_u33 = mmcif_dict["_atom_site.aniso_U[3][3]"] 103 aniso_flag = 1 104 except KeyError: 105 # no anisotropic B factors 106 aniso_flag = 0 107 # if auth_seq_id is present, we use this. 108 # Otherwise label_seq_id is used. 109 if "_atom_site.auth_seq_id" in mmcif_dict: 110 seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] 111 else: 112 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 113 # Now loop over atoms and build the structure 114 current_chain_id = None 115 current_residue_id = None 116 structure_builder = self._structure_builder 117 structure_builder.init_structure(structure_id) 118 structure_builder.init_seg(" ") 119 # Historically, Biopython PDB parser uses model_id to mean array index 120 # so serial_id means the Model ID specified in the file 121 current_model_id = -1 122 current_serial_id = -1 123 for i in range(0, len(atom_id_list)): 124 125 # set the line_counter for 'ATOM' lines only and not 126 # as a global line counter found in the PDBParser() 127 # this number should match the '_atom_site.id' index in the MMCIF 128 structure_builder.set_line_counter(i) 129 130 x = x_list[i] 131 y = y_list[i] 132 z = z_list[i] 133 resname = residue_id_list[i] 134 chainid = chain_id_list[i] 135 altloc = alt_list[i] 136 if altloc == ".": 137 altloc = " " 138 int_resseq = int(seq_id_list[i]) 139 icode = icode_list[i] 140 if icode == "?": 141 icode = " " 142 name = atom_id_list[i] 143 # occupancy & B factor 144 try: 145 tempfactor = float(b_factor_list[i]) 146 except ValueError: 147 raise PDBConstructionException("Invalid or missing B factor") 148 try: 149 occupancy = float(occupancy_list[i]) 150 except ValueError: 151 raise PDBConstructionException("Invalid or missing occupancy") 152 fieldname = fieldname_list[i] 153 if fieldname == "HETATM": 154 hetatm_flag = "H" 155 else: 156 hetatm_flag = " " 157 158 resseq = (hetatm_flag, int_resseq, icode) 159 160 if serial_list is not None: 161 # model column exists; use it 162 serial_id = serial_list[i] 163 if current_serial_id != serial_id: 164 # if serial changes, update it and start new model 165 current_serial_id = serial_id 166 current_model_id += 1 167 structure_builder.init_model(current_model_id, current_serial_id) 168 current_chain_id = None 169 current_residue_id = None 170 else: 171 # no explicit model column; initialize single model 172 structure_builder.init_model(current_model_id) 173 174 if current_chain_id != chainid: 175 current_chain_id = chainid 176 structure_builder.init_chain(current_chain_id) 177 current_residue_id = None 178 179 if current_residue_id != resseq: 180 current_residue_id = resseq 181 structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) 182 183 coord = numpy.array((x, y, z), 'f') 184 element = element_list[i] if element_list else None 185 structure_builder.init_atom(name, coord, tempfactor, occupancy, altloc, 186 name, element=element) 187 if aniso_flag == 1: 188 u = (aniso_u11[i], aniso_u12[i], aniso_u13[i], 189 aniso_u22[i], aniso_u23[i], aniso_u33[i]) 190 mapped_anisou = [float(x) for x in u] 191 anisou_array = numpy.array(mapped_anisou, 'f') 192 structure_builder.set_anisou(anisou_array) 193 # Now try to set the cell 194 try: 195 a = float(mmcif_dict["_cell.length_a"]) 196 b = float(mmcif_dict["_cell.length_b"]) 197 c = float(mmcif_dict["_cell.length_c"]) 198 alpha = float(mmcif_dict["_cell.angle_alpha"]) 199 beta = float(mmcif_dict["_cell.angle_beta"]) 200 gamma = float(mmcif_dict["_cell.angle_gamma"]) 201 cell = numpy.array((a, b, c, alpha, beta, gamma), 'f') 202 spacegroup = mmcif_dict["_symmetry.space_group_name_H-M"] 203 spacegroup = spacegroup[1:-1] # get rid of quotes!! 204 if spacegroup is None: 205 raise Exception 206 structure_builder.set_symmetry(spacegroup, cell) 207 except Exception: 208 pass # no cell found, so just ignore
209 210
211 -class FastMMCIFParser(object):
212 """Parse an MMCIF file and return a Structure object.""" 213
214 - def __init__(self, structure_builder=None, QUIET=False):
215 """Create a FastMMCIFParser object. 216 217 The mmCIF parser calls a number of standard methods in an aggregated 218 StructureBuilder object. Normally this object is instanciated by the 219 parser object itself, but if the user provides his/her own 220 StructureBuilder object, the latter is used instead. 221 222 The main difference between this class and the regular MMCIFParser is 223 that only 'ATOM' and 'HETATM' lines are parsed here. Use if you are 224 interested only in coordinate information. 225 226 Arguments: 227 - structure_builder - an optional user implemented StructureBuilder class. 228 - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 229 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 230 These warnings might be indicative of problems in the mmCIF file! 231 """ 232 if structure_builder is not None: 233 self._structure_builder = structure_builder 234 else: 235 self._structure_builder = StructureBuilder() 236 237 self.line_counter = 0 238 self.build_structure = None 239 self.QUIET = bool(QUIET)
240 241 # Public methods 242
243 - def get_structure(self, structure_id, filename):
244 """Return the structure. 245 246 Arguments: 247 - structure_id - string, the id that will be used for the structure 248 - filename - name of the mmCIF file OR an open filehandle 249 """ 250 with warnings.catch_warnings(): 251 if self.QUIET: 252 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 253 with as_handle(filename) as handle: 254 self._build_structure(structure_id, handle) 255 256 return self._structure_builder.get_structure()
257 258 # Private methods 259
260 - def _build_structure(self, structure_id, filehandle):
261 262 # Read only _atom_site. and atom_site_anisotrop entries 263 read_atom, read_aniso = False, False 264 _fields, _records = [], [] 265 _anisof, _anisors = [], [] 266 for line in filehandle: 267 if line.startswith('_atom_site.'): 268 read_atom = True 269 _fields.append(line.strip()) 270 elif line.startswith('_atom_site_anisotrop.'): 271 read_aniso = True 272 _anisof.append(line.strip()) 273 elif read_atom and line.startswith('#'): 274 read_atom = False 275 elif read_aniso and line.startswith('#'): 276 read_aniso = False 277 elif read_atom: 278 _records.append(line.strip()) 279 elif read_aniso: 280 _anisors.append(line.strip()) 281 282 # Dumping the shlex module here since this particular 283 # category should be rather straightforward. 284 # Quite a performance boost.. 285 _record_tbl = zip(*map(str.split, _records)) 286 _anisob_tbl = zip(*map(str.split, _anisors)) 287 288 mmcif_dict = dict(zip(_fields, _record_tbl)) 289 mmcif_dict.update(dict(zip(_anisof, _anisob_tbl))) 290 291 # Build structure object 292 atom_id_list = mmcif_dict["_atom_site.label_atom_id"] 293 residue_id_list = mmcif_dict["_atom_site.label_comp_id"] 294 295 try: 296 element_list = mmcif_dict["_atom_site.type_symbol"] 297 except KeyError: 298 element_list = None 299 300 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 301 chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] 302 303 x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] 304 y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] 305 z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] 306 alt_list = mmcif_dict["_atom_site.label_alt_id"] 307 icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] 308 b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] 309 occupancy_list = mmcif_dict["_atom_site.occupancy"] 310 fieldname_list = mmcif_dict["_atom_site.group_PDB"] 311 312 try: 313 serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] 314 except KeyError: 315 # No model number column 316 serial_list = None 317 except ValueError: 318 # Invalid model number (malformed file) 319 raise PDBConstructionException("Invalid model number") 320 321 try: 322 aniso_u11 = mmcif_dict["_atom_site.aniso_U[1][1]"] 323 aniso_u12 = mmcif_dict["_atom_site.aniso_U[1][2]"] 324 aniso_u13 = mmcif_dict["_atom_site.aniso_U[1][3]"] 325 aniso_u22 = mmcif_dict["_atom_site.aniso_U[2][2]"] 326 aniso_u23 = mmcif_dict["_atom_site.aniso_U[2][3]"] 327 aniso_u33 = mmcif_dict["_atom_site.aniso_U[3][3]"] 328 aniso_flag = 1 329 except KeyError: 330 # no anisotropic B factors 331 aniso_flag = 0 332 333 # if auth_seq_id is present, we use this. 334 # Otherwise label_seq_id is used. 335 if "_atom_site.auth_seq_id" in mmcif_dict: 336 seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] 337 else: 338 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 339 340 # Now loop over atoms and build the structure 341 current_chain_id = None 342 current_residue_id = None 343 structure_builder = self._structure_builder 344 structure_builder.init_structure(structure_id) 345 structure_builder.init_seg(" ") 346 347 # Historically, Biopython PDB parser uses model_id to mean array index 348 # so serial_id means the Model ID specified in the file 349 current_model_id = -1 350 current_serial_id = -1 351 for i in range(0, len(atom_id_list)): 352 353 # set the line_counter for 'ATOM' lines only and not 354 # as a global line counter found in the PDBParser() 355 # this number should match the '_atom_site.id' index in the MMCIF 356 structure_builder.set_line_counter(i) 357 358 x = x_list[i] 359 y = y_list[i] 360 z = z_list[i] 361 resname = residue_id_list[i] 362 chainid = chain_id_list[i] 363 altloc = alt_list[i] 364 if altloc == ".": 365 altloc = " " 366 int_resseq = int(seq_id_list[i]) 367 icode = icode_list[i] 368 if icode == "?": 369 icode = " " 370 name = atom_id_list[i].strip('"') # Remove occasional " from quoted atom names (e.g. xNA) 371 372 # occupancy & B factor 373 try: 374 tempfactor = float(b_factor_list[i]) 375 except ValueError: 376 raise PDBConstructionException("Invalid or missing B factor") 377 378 try: 379 occupancy = float(occupancy_list[i]) 380 except ValueError: 381 raise PDBConstructionException("Invalid or missing occupancy") 382 383 fieldname = fieldname_list[i] 384 if fieldname == "HETATM": 385 hetatm_flag = "H" 386 else: 387 hetatm_flag = " " 388 389 resseq = (hetatm_flag, int_resseq, icode) 390 391 if serial_list is not None: 392 # model column exists; use it 393 serial_id = serial_list[i] 394 if current_serial_id != serial_id: 395 # if serial changes, update it and start new model 396 current_serial_id = serial_id 397 current_model_id += 1 398 structure_builder.init_model(current_model_id, current_serial_id) 399 current_chain_id = None 400 current_residue_id = None 401 else: 402 # no explicit model column; initialize single model 403 structure_builder.init_model(current_model_id) 404 405 if current_chain_id != chainid: 406 current_chain_id = chainid 407 structure_builder.init_chain(current_chain_id) 408 current_residue_id = None 409 410 if current_residue_id != resseq: 411 current_residue_id = resseq 412 structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) 413 414 coord = numpy.array((x, y, z), 'f') 415 element = element_list[i] if element_list else None 416 structure_builder.init_atom(name, coord, tempfactor, occupancy, altloc, 417 name, element=element) 418 if aniso_flag == 1: 419 u = (aniso_u11[i], aniso_u12[i], aniso_u13[i], 420 aniso_u22[i], aniso_u23[i], aniso_u33[i]) 421 mapped_anisou = [float(x) for x in u] 422 anisou_array = numpy.array(mapped_anisou, 'f') 423 structure_builder.set_anisou(anisou_array)
424 425 if __name__ == "__main__": 426 import sys 427 428 if len(sys.argv) != 2: 429 print("Usage: python MMCIFparser.py filename") 430 raise SystemExit 431 filename = sys.argv[1] 432 433 p = MMCIFParser() 434 435 structure = p.get_structure("test", filename) 436 437 for model in structure.get_list(): 438 print(model) 439 for chain in model.get_list(): 440 print(chain) 441 print("Found %d residues." % len(chain.get_list())) 442