Package Bio :: Package PDB :: Module MMCIFParser'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.MMCIFParser'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """mmCIF parsers""" 
  7   
  8  from __future__ import print_function 
  9   
 10  import numpy 
 11  import warnings 
 12   
 13  from Bio.File import as_handle 
 14  from Bio._py3k import range 
 15   
 16  from Bio.PDB.MMCIF2Dict import MMCIF2Dict 
 17  from Bio.PDB.StructureBuilder import StructureBuilder 
 18  from Bio.PDB.PDBExceptions import PDBConstructionException 
 19  from Bio.PDB.PDBExceptions import PDBConstructionWarning 
 20   
 21   
22 -class MMCIFParser(object):
23 """Parse a mmCIF file and return a Structure object.""" 24
25 - def __init__(self, structure_builder=None, QUIET=False):
26 """Create a PDBParser object. 27 28 The mmCIF parser calls a number of standard methods in an aggregated 29 StructureBuilder object. Normally this object is instanciated by the 30 MMCIParser object itself, but if the user provides his/her own 31 StructureBuilder object, the latter is used instead. 32 33 Arguments: 34 - structure_builder - an optional user implemented StructureBuilder class. 35 - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 36 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 37 These warnings might be indicative of problems in the mmCIF file! 38 """ 39 if structure_builder is not None: 40 self._structure_builder = structure_builder 41 else: 42 self._structure_builder = StructureBuilder() 43 # self.header = None 44 # self.trailer = None 45 self.line_counter = 0 46 self.build_structure = None 47 self.QUIET = bool(QUIET)
48 49 # Public methods 50
51 - def get_structure(self, structure_id, filename):
52 """Return the structure. 53 54 Arguments: 55 - structure_id - string, the id that will be used for the structure 56 - filename - name of the mmCIF file OR an open filehandle 57 """ 58 with warnings.catch_warnings(): 59 if self.QUIET: 60 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 61 self._mmcif_dict = MMCIF2Dict(filename) 62 self._build_structure(structure_id) 63 64 return self._structure_builder.get_structure()
65 66 # Private methods 67
68 - def _build_structure(self, structure_id):
69 mmcif_dict = self._mmcif_dict 70 atom_id_list = mmcif_dict["_atom_site.label_atom_id"] 71 residue_id_list = mmcif_dict["_atom_site.label_comp_id"] 72 try: 73 element_list = mmcif_dict["_atom_site.type_symbol"] 74 except KeyError: 75 element_list = None 76 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 77 chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] 78 x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] 79 y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] 80 z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] 81 alt_list = mmcif_dict["_atom_site.label_alt_id"] 82 icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] 83 b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] 84 occupancy_list = mmcif_dict["_atom_site.occupancy"] 85 fieldname_list = mmcif_dict["_atom_site.group_PDB"] 86 try: 87 serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] 88 except KeyError: 89 # No model number column 90 serial_list = None 91 except ValueError: 92 # Invalid model number (malformed file) 93 raise PDBConstructionException("Invalid model number") 94 try: 95 aniso_u11 = mmcif_dict["_atom_site.aniso_U[1][1]"] 96 aniso_u12 = mmcif_dict["_atom_site.aniso_U[1][2]"] 97 aniso_u13 = mmcif_dict["_atom_site.aniso_U[1][3]"] 98 aniso_u22 = mmcif_dict["_atom_site.aniso_U[2][2]"] 99 aniso_u23 = mmcif_dict["_atom_site.aniso_U[2][3]"] 100 aniso_u33 = mmcif_dict["_atom_site.aniso_U[3][3]"] 101 aniso_flag = 1 102 except KeyError: 103 # no anisotropic B factors 104 aniso_flag = 0 105 # if auth_seq_id is present, we use this. 106 # Otherwise label_seq_id is used. 107 if "_atom_site.auth_seq_id" in mmcif_dict: 108 seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] 109 else: 110 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 111 # Now loop over atoms and build the structure 112 current_chain_id = None 113 current_residue_id = None 114 current_resname = None 115 structure_builder = self._structure_builder 116 structure_builder.init_structure(structure_id) 117 structure_builder.init_seg(" ") 118 # Historically, Biopython PDB parser uses model_id to mean array index 119 # so serial_id means the Model ID specified in the file 120 current_model_id = -1 121 current_serial_id = -1 122 for i in range(0, len(atom_id_list)): 123 124 # set the line_counter for 'ATOM' lines only and not 125 # as a global line counter found in the PDBParser() 126 # this number should match the '_atom_site.id' index in the MMCIF 127 structure_builder.set_line_counter(i) 128 129 x = x_list[i] 130 y = y_list[i] 131 z = z_list[i] 132 resname = residue_id_list[i] 133 chainid = chain_id_list[i] 134 altloc = alt_list[i] 135 if altloc == ".": 136 altloc = " " 137 int_resseq = int(seq_id_list[i]) 138 icode = icode_list[i] 139 if icode == "?": 140 icode = " " 141 name = atom_id_list[i] 142 # occupancy & B factor 143 try: 144 tempfactor = float(b_factor_list[i]) 145 except ValueError: 146 raise PDBConstructionException("Invalid or missing B factor") 147 try: 148 occupancy = float(occupancy_list[i]) 149 except ValueError: 150 raise PDBConstructionException("Invalid or missing occupancy") 151 fieldname = fieldname_list[i] 152 if fieldname == "HETATM": 153 hetatm_flag = "H" 154 else: 155 hetatm_flag = " " 156 157 resseq = (hetatm_flag, int_resseq, icode) 158 159 if serial_list is not None: 160 # model column exists; use it 161 serial_id = serial_list[i] 162 if current_serial_id != serial_id: 163 # if serial changes, update it and start new model 164 current_serial_id = serial_id 165 current_model_id += 1 166 structure_builder.init_model(current_model_id, current_serial_id) 167 current_chain_id = None 168 current_residue_id = None 169 current_resname = None 170 else: 171 # no explicit model column; initialize single model 172 structure_builder.init_model(current_model_id) 173 174 if current_chain_id != chainid: 175 current_chain_id = chainid 176 structure_builder.init_chain(current_chain_id) 177 current_residue_id = None 178 current_resname = None 179 180 if current_residue_id != resseq or current_resname != resname: 181 current_residue_id = resseq 182 current_resname = resname 183 structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) 184 185 coord = numpy.array((x, y, z), 'f') 186 element = element_list[i] if element_list else None 187 structure_builder.init_atom(name, coord, tempfactor, occupancy, altloc, 188 name, element=element) 189 if aniso_flag == 1: 190 u = (aniso_u11[i], aniso_u12[i], aniso_u13[i], 191 aniso_u22[i], aniso_u23[i], aniso_u33[i]) 192 mapped_anisou = [float(x) for x in u] 193 anisou_array = numpy.array(mapped_anisou, 'f') 194 structure_builder.set_anisou(anisou_array) 195 # Now try to set the cell 196 try: 197 a = float(mmcif_dict["_cell.length_a"]) 198 b = float(mmcif_dict["_cell.length_b"]) 199 c = float(mmcif_dict["_cell.length_c"]) 200 alpha = float(mmcif_dict["_cell.angle_alpha"]) 201 beta = float(mmcif_dict["_cell.angle_beta"]) 202 gamma = float(mmcif_dict["_cell.angle_gamma"]) 203 cell = numpy.array((a, b, c, alpha, beta, gamma), 'f') 204 spacegroup = mmcif_dict["_symmetry.space_group_name_H-M"] 205 spacegroup = spacegroup[1:-1] # get rid of quotes!! 206 if spacegroup is None: 207 raise Exception 208 structure_builder.set_symmetry(spacegroup, cell) 209 except Exception: 210 pass # no cell found, so just ignore
211 212
213 -class FastMMCIFParser(object):
214 """Parse an MMCIF file and return a Structure object.""" 215
216 - def __init__(self, structure_builder=None, QUIET=False):
217 """Create a FastMMCIFParser object. 218 219 The mmCIF parser calls a number of standard methods in an aggregated 220 StructureBuilder object. Normally this object is instanciated by the 221 parser object itself, but if the user provides his/her own 222 StructureBuilder object, the latter is used instead. 223 224 The main difference between this class and the regular MMCIFParser is 225 that only 'ATOM' and 'HETATM' lines are parsed here. Use if you are 226 interested only in coordinate information. 227 228 Arguments: 229 - structure_builder - an optional user implemented StructureBuilder class. 230 - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 231 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 232 These warnings might be indicative of problems in the mmCIF file! 233 """ 234 if structure_builder is not None: 235 self._structure_builder = structure_builder 236 else: 237 self._structure_builder = StructureBuilder() 238 239 self.line_counter = 0 240 self.build_structure = None 241 self.QUIET = bool(QUIET)
242 243 # Public methods 244
245 - def get_structure(self, structure_id, filename):
246 """Return the structure. 247 248 Arguments: 249 - structure_id - string, the id that will be used for the structure 250 - filename - name of the mmCIF file OR an open filehandle 251 """ 252 with warnings.catch_warnings(): 253 if self.QUIET: 254 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 255 with as_handle(filename) as handle: 256 self._build_structure(structure_id, handle) 257 258 return self._structure_builder.get_structure()
259 260 # Private methods 261
262 - def _build_structure(self, structure_id, filehandle):
263 264 # Read only _atom_site. and atom_site_anisotrop entries 265 read_atom, read_aniso = False, False 266 _fields, _records = [], [] 267 _anisof, _anisors = [], [] 268 for line in filehandle: 269 if line.startswith('_atom_site.'): 270 read_atom = True 271 _fields.append(line.strip()) 272 elif line.startswith('_atom_site_anisotrop.'): 273 read_aniso = True 274 _anisof.append(line.strip()) 275 elif read_atom and line.startswith('#'): 276 read_atom = False 277 elif read_aniso and line.startswith('#'): 278 read_aniso = False 279 elif read_atom: 280 _records.append(line.strip()) 281 elif read_aniso: 282 _anisors.append(line.strip()) 283 284 # Dumping the shlex module here since this particular 285 # category should be rather straightforward. 286 # Quite a performance boost.. 287 _record_tbl = zip(*map(str.split, _records)) 288 _anisob_tbl = zip(*map(str.split, _anisors)) 289 290 mmcif_dict = dict(zip(_fields, _record_tbl)) 291 mmcif_dict.update(dict(zip(_anisof, _anisob_tbl))) 292 293 # Build structure object 294 atom_id_list = mmcif_dict["_atom_site.label_atom_id"] 295 residue_id_list = mmcif_dict["_atom_site.label_comp_id"] 296 297 try: 298 element_list = mmcif_dict["_atom_site.type_symbol"] 299 except KeyError: 300 element_list = None 301 302 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 303 chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] 304 305 x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] 306 y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] 307 z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] 308 alt_list = mmcif_dict["_atom_site.label_alt_id"] 309 icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] 310 b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] 311 occupancy_list = mmcif_dict["_atom_site.occupancy"] 312 fieldname_list = mmcif_dict["_atom_site.group_PDB"] 313 314 try: 315 serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] 316 except KeyError: 317 # No model number column 318 serial_list = None 319 except ValueError: 320 # Invalid model number (malformed file) 321 raise PDBConstructionException("Invalid model number") 322 323 try: 324 aniso_u11 = mmcif_dict["_atom_site.aniso_U[1][1]"] 325 aniso_u12 = mmcif_dict["_atom_site.aniso_U[1][2]"] 326 aniso_u13 = mmcif_dict["_atom_site.aniso_U[1][3]"] 327 aniso_u22 = mmcif_dict["_atom_site.aniso_U[2][2]"] 328 aniso_u23 = mmcif_dict["_atom_site.aniso_U[2][3]"] 329 aniso_u33 = mmcif_dict["_atom_site.aniso_U[3][3]"] 330 aniso_flag = 1 331 except KeyError: 332 # no anisotropic B factors 333 aniso_flag = 0 334 335 # if auth_seq_id is present, we use this. 336 # Otherwise label_seq_id is used. 337 if "_atom_site.auth_seq_id" in mmcif_dict: 338 seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] 339 else: 340 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 341 342 # Now loop over atoms and build the structure 343 current_chain_id = None 344 current_residue_id = None 345 current_resname = None 346 structure_builder = self._structure_builder 347 structure_builder.init_structure(structure_id) 348 structure_builder.init_seg(" ") 349 350 # Historically, Biopython PDB parser uses model_id to mean array index 351 # so serial_id means the Model ID specified in the file 352 current_model_id = -1 353 current_serial_id = -1 354 for i in range(0, len(atom_id_list)): 355 356 # set the line_counter for 'ATOM' lines only and not 357 # as a global line counter found in the PDBParser() 358 # this number should match the '_atom_site.id' index in the MMCIF 359 structure_builder.set_line_counter(i) 360 361 x = x_list[i] 362 y = y_list[i] 363 z = z_list[i] 364 resname = residue_id_list[i] 365 chainid = chain_id_list[i] 366 altloc = alt_list[i] 367 if altloc == ".": 368 altloc = " " 369 int_resseq = int(seq_id_list[i]) 370 icode = icode_list[i] 371 if icode == "?": 372 icode = " " 373 name = atom_id_list[i].strip('"') # Remove occasional " from quoted atom names (e.g. xNA) 374 375 # occupancy & B factor 376 try: 377 tempfactor = float(b_factor_list[i]) 378 except ValueError: 379 raise PDBConstructionException("Invalid or missing B factor") 380 381 try: 382 occupancy = float(occupancy_list[i]) 383 except ValueError: 384 raise PDBConstructionException("Invalid or missing occupancy") 385 386 fieldname = fieldname_list[i] 387 if fieldname == "HETATM": 388 hetatm_flag = "H" 389 else: 390 hetatm_flag = " " 391 392 resseq = (hetatm_flag, int_resseq, icode) 393 394 if serial_list is not None: 395 # model column exists; use it 396 serial_id = serial_list[i] 397 if current_serial_id != serial_id: 398 # if serial changes, update it and start new model 399 current_serial_id = serial_id 400 current_model_id += 1 401 structure_builder.init_model(current_model_id, current_serial_id) 402 current_chain_id = None 403 current_residue_id = None 404 current_resname = None 405 else: 406 # no explicit model column; initialize single model 407 structure_builder.init_model(current_model_id) 408 409 if current_chain_id != chainid: 410 current_chain_id = chainid 411 structure_builder.init_chain(current_chain_id) 412 current_residue_id = None 413 current_resname = None 414 415 if current_residue_id != resseq or current_resname != resname: 416 current_residue_id = resseq 417 current_resname = resname 418 structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) 419 420 coord = numpy.array((x, y, z), 'f') 421 element = element_list[i] if element_list else None 422 structure_builder.init_atom(name, coord, tempfactor, occupancy, altloc, 423 name, element=element) 424 if aniso_flag == 1: 425 u = (aniso_u11[i], aniso_u12[i], aniso_u13[i], 426 aniso_u22[i], aniso_u23[i], aniso_u33[i]) 427 mapped_anisou = [float(x) for x in u] 428 anisou_array = numpy.array(mapped_anisou, 'f') 429 structure_builder.set_anisou(anisou_array)
430 431 if __name__ == "__main__": 432 import sys 433 434 if len(sys.argv) != 2: 435 print("Usage: python MMCIFparser.py filename") 436 raise SystemExit 437 filename = sys.argv[1] 438 439 p = MMCIFParser() 440 441 structure = p.get_structure("test", filename) 442 443 for model in structure.get_list(): 444 print(model) 445 for chain in model.get_list(): 446 print(chain) 447 print("Found %d residues." % len(chain.get_list())) 448