Package Bio :: Package PDB :: Module MMCIFParser'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.MMCIFParser'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """mmCIF parsers.""" 
  7   
  8  from __future__ import print_function 
  9   
 10  import numpy 
 11  import warnings 
 12   
 13  from Bio.File import as_handle 
 14  from Bio._py3k import range 
 15   
 16  from Bio.PDB.MMCIF2Dict import MMCIF2Dict 
 17  from Bio.PDB.StructureBuilder import StructureBuilder 
 18  from Bio.PDB.PDBExceptions import PDBConstructionException 
 19  from Bio.PDB.PDBExceptions import PDBConstructionWarning 
 20   
 21   
22 -class MMCIFParser(object):
23 """Parse a mmCIF file and return a Structure object.""" 24
25 - def __init__(self, structure_builder=None, QUIET=False):
26 """Create a PDBParser object. 27 28 The mmCIF parser calls a number of standard methods in an aggregated 29 StructureBuilder object. Normally this object is instanciated by the 30 MMCIParser object itself, but if the user provides his/her own 31 StructureBuilder object, the latter is used instead. 32 33 Arguments: 34 - structure_builder - an optional user implemented StructureBuilder class. 35 - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 36 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 37 These warnings might be indicative of problems in the mmCIF file! 38 39 """ 40 if structure_builder is not None: 41 self._structure_builder = structure_builder 42 else: 43 self._structure_builder = StructureBuilder() 44 # self.header = None 45 # self.trailer = None 46 self.line_counter = 0 47 self.build_structure = None 48 self.QUIET = bool(QUIET)
49 50 # Public methods 51
52 - def get_structure(self, structure_id, filename):
53 """Return the structure. 54 55 Arguments: 56 - structure_id - string, the id that will be used for the structure 57 - filename - name of the mmCIF file OR an open filehandle 58 59 """ 60 with warnings.catch_warnings(): 61 if self.QUIET: 62 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 63 self._mmcif_dict = MMCIF2Dict(filename) 64 self._build_structure(structure_id) 65 66 return self._structure_builder.get_structure()
67 68 # Private methods 69
70 - def _build_structure(self, structure_id):
71 mmcif_dict = self._mmcif_dict 72 atom_id_list = mmcif_dict["_atom_site.label_atom_id"] 73 residue_id_list = mmcif_dict["_atom_site.label_comp_id"] 74 try: 75 element_list = mmcif_dict["_atom_site.type_symbol"] 76 except KeyError: 77 element_list = None 78 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 79 chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] 80 x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] 81 y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] 82 z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] 83 alt_list = mmcif_dict["_atom_site.label_alt_id"] 84 icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] 85 b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] 86 occupancy_list = mmcif_dict["_atom_site.occupancy"] 87 fieldname_list = mmcif_dict["_atom_site.group_PDB"] 88 try: 89 serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] 90 except KeyError: 91 # No model number column 92 serial_list = None 93 except ValueError: 94 # Invalid model number (malformed file) 95 raise PDBConstructionException("Invalid model number") 96 try: 97 aniso_u11 = mmcif_dict["_atom_site.aniso_U[1][1]"] 98 aniso_u12 = mmcif_dict["_atom_site.aniso_U[1][2]"] 99 aniso_u13 = mmcif_dict["_atom_site.aniso_U[1][3]"] 100 aniso_u22 = mmcif_dict["_atom_site.aniso_U[2][2]"] 101 aniso_u23 = mmcif_dict["_atom_site.aniso_U[2][3]"] 102 aniso_u33 = mmcif_dict["_atom_site.aniso_U[3][3]"] 103 aniso_flag = 1 104 except KeyError: 105 # no anisotropic B factors 106 aniso_flag = 0 107 # if auth_seq_id is present, we use this. 108 # Otherwise label_seq_id is used. 109 if "_atom_site.auth_seq_id" in mmcif_dict: 110 seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] 111 else: 112 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 113 # Now loop over atoms and build the structure 114 current_chain_id = None 115 current_residue_id = None 116 current_resname = None 117 structure_builder = self._structure_builder 118 structure_builder.init_structure(structure_id) 119 structure_builder.init_seg(" ") 120 # Historically, Biopython PDB parser uses model_id to mean array index 121 # so serial_id means the Model ID specified in the file 122 current_model_id = -1 123 current_serial_id = -1 124 for i in range(0, len(atom_id_list)): 125 126 # set the line_counter for 'ATOM' lines only and not 127 # as a global line counter found in the PDBParser() 128 # this number should match the '_atom_site.id' index in the MMCIF 129 structure_builder.set_line_counter(i) 130 131 x = x_list[i] 132 y = y_list[i] 133 z = z_list[i] 134 resname = residue_id_list[i] 135 chainid = chain_id_list[i] 136 altloc = alt_list[i] 137 if altloc == ".": 138 altloc = " " 139 int_resseq = int(seq_id_list[i]) 140 icode = icode_list[i] 141 if icode == "?": 142 icode = " " 143 name = atom_id_list[i] 144 # occupancy & B factor 145 try: 146 tempfactor = float(b_factor_list[i]) 147 except ValueError: 148 raise PDBConstructionException("Invalid or missing B factor") 149 try: 150 occupancy = float(occupancy_list[i]) 151 except ValueError: 152 raise PDBConstructionException("Invalid or missing occupancy") 153 fieldname = fieldname_list[i] 154 if fieldname == "HETATM": 155 if resname == "HOH" or resname == "WAT": 156 hetatm_flag = "W" 157 else: 158 hetatm_flag = "H" 159 else: 160 hetatm_flag = " " 161 162 resseq = (hetatm_flag, int_resseq, icode) 163 164 if serial_list is not None: 165 # model column exists; use it 166 serial_id = serial_list[i] 167 if current_serial_id != serial_id: 168 # if serial changes, update it and start new model 169 current_serial_id = serial_id 170 current_model_id += 1 171 structure_builder.init_model(current_model_id, current_serial_id) 172 current_chain_id = None 173 current_residue_id = None 174 current_resname = None 175 else: 176 # no explicit model column; initialize single model 177 structure_builder.init_model(current_model_id) 178 179 if current_chain_id != chainid: 180 current_chain_id = chainid 181 structure_builder.init_chain(current_chain_id) 182 current_residue_id = None 183 current_resname = None 184 185 if current_residue_id != resseq or current_resname != resname: 186 current_residue_id = resseq 187 current_resname = resname 188 structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) 189 190 coord = numpy.array((x, y, z), 'f') 191 element = element_list[i] if element_list else None 192 structure_builder.init_atom(name, coord, tempfactor, occupancy, altloc, 193 name, element=element) 194 if aniso_flag == 1: 195 u = (aniso_u11[i], aniso_u12[i], aniso_u13[i], 196 aniso_u22[i], aniso_u23[i], aniso_u33[i]) 197 mapped_anisou = [float(x) for x in u] 198 anisou_array = numpy.array(mapped_anisou, 'f') 199 structure_builder.set_anisou(anisou_array) 200 # Now try to set the cell 201 try: 202 a = float(mmcif_dict["_cell.length_a"]) 203 b = float(mmcif_dict["_cell.length_b"]) 204 c = float(mmcif_dict["_cell.length_c"]) 205 alpha = float(mmcif_dict["_cell.angle_alpha"]) 206 beta = float(mmcif_dict["_cell.angle_beta"]) 207 gamma = float(mmcif_dict["_cell.angle_gamma"]) 208 cell = numpy.array((a, b, c, alpha, beta, gamma), 'f') 209 spacegroup = mmcif_dict["_symmetry.space_group_name_H-M"] 210 spacegroup = spacegroup[1:-1] # get rid of quotes!! 211 if spacegroup is None: 212 raise Exception 213 structure_builder.set_symmetry(spacegroup, cell) 214 except Exception: 215 pass # no cell found, so just ignore
216 217
218 -class FastMMCIFParser(object):
219 """Parse an MMCIF file and return a Structure object.""" 220
221 - def __init__(self, structure_builder=None, QUIET=False):
222 """Create a FastMMCIFParser object. 223 224 The mmCIF parser calls a number of standard methods in an aggregated 225 StructureBuilder object. Normally this object is instanciated by the 226 parser object itself, but if the user provides his/her own 227 StructureBuilder object, the latter is used instead. 228 229 The main difference between this class and the regular MMCIFParser is 230 that only 'ATOM' and 'HETATM' lines are parsed here. Use if you are 231 interested only in coordinate information. 232 233 Arguments: 234 - structure_builder - an optional user implemented StructureBuilder class. 235 - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 236 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 237 These warnings might be indicative of problems in the mmCIF file! 238 239 """ 240 if structure_builder is not None: 241 self._structure_builder = structure_builder 242 else: 243 self._structure_builder = StructureBuilder() 244 245 self.line_counter = 0 246 self.build_structure = None 247 self.QUIET = bool(QUIET)
248 249 # Public methods 250
251 - def get_structure(self, structure_id, filename):
252 """Return the structure. 253 254 Arguments: 255 - structure_id - string, the id that will be used for the structure 256 - filename - name of the mmCIF file OR an open filehandle 257 258 """ 259 with warnings.catch_warnings(): 260 if self.QUIET: 261 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 262 with as_handle(filename) as handle: 263 self._build_structure(structure_id, handle) 264 265 return self._structure_builder.get_structure()
266 267 # Private methods 268
269 - def _build_structure(self, structure_id, filehandle):
270 271 # Read only _atom_site. and atom_site_anisotrop entries 272 read_atom, read_aniso = False, False 273 _fields, _records = [], [] 274 _anisof, _anisors = [], [] 275 for line in filehandle: 276 if line.startswith('_atom_site.'): 277 read_atom = True 278 _fields.append(line.strip()) 279 elif line.startswith('_atom_site_anisotrop.'): 280 read_aniso = True 281 _anisof.append(line.strip()) 282 elif read_atom and line.startswith('#'): 283 read_atom = False 284 elif read_aniso and line.startswith('#'): 285 read_aniso = False 286 elif read_atom: 287 _records.append(line.strip()) 288 elif read_aniso: 289 _anisors.append(line.strip()) 290 291 # Dumping the shlex module here since this particular 292 # category should be rather straightforward. 293 # Quite a performance boost.. 294 _record_tbl = zip(*map(str.split, _records)) 295 _anisob_tbl = zip(*map(str.split, _anisors)) 296 297 mmcif_dict = dict(zip(_fields, _record_tbl)) 298 mmcif_dict.update(dict(zip(_anisof, _anisob_tbl))) 299 300 # Build structure object 301 atom_id_list = mmcif_dict["_atom_site.label_atom_id"] 302 residue_id_list = mmcif_dict["_atom_site.label_comp_id"] 303 304 try: 305 element_list = mmcif_dict["_atom_site.type_symbol"] 306 except KeyError: 307 element_list = None 308 309 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 310 chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] 311 312 x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] 313 y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] 314 z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] 315 alt_list = mmcif_dict["_atom_site.label_alt_id"] 316 icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] 317 b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] 318 occupancy_list = mmcif_dict["_atom_site.occupancy"] 319 fieldname_list = mmcif_dict["_atom_site.group_PDB"] 320 321 try: 322 serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] 323 except KeyError: 324 # No model number column 325 serial_list = None 326 except ValueError: 327 # Invalid model number (malformed file) 328 raise PDBConstructionException("Invalid model number") 329 330 try: 331 aniso_u11 = mmcif_dict["_atom_site.aniso_U[1][1]"] 332 aniso_u12 = mmcif_dict["_atom_site.aniso_U[1][2]"] 333 aniso_u13 = mmcif_dict["_atom_site.aniso_U[1][3]"] 334 aniso_u22 = mmcif_dict["_atom_site.aniso_U[2][2]"] 335 aniso_u23 = mmcif_dict["_atom_site.aniso_U[2][3]"] 336 aniso_u33 = mmcif_dict["_atom_site.aniso_U[3][3]"] 337 aniso_flag = 1 338 except KeyError: 339 # no anisotropic B factors 340 aniso_flag = 0 341 342 # if auth_seq_id is present, we use this. 343 # Otherwise label_seq_id is used. 344 if "_atom_site.auth_seq_id" in mmcif_dict: 345 seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] 346 else: 347 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 348 349 # Now loop over atoms and build the structure 350 current_chain_id = None 351 current_residue_id = None 352 current_resname = None 353 structure_builder = self._structure_builder 354 structure_builder.init_structure(structure_id) 355 structure_builder.init_seg(" ") 356 357 # Historically, Biopython PDB parser uses model_id to mean array index 358 # so serial_id means the Model ID specified in the file 359 current_model_id = -1 360 current_serial_id = -1 361 for i in range(0, len(atom_id_list)): 362 363 # set the line_counter for 'ATOM' lines only and not 364 # as a global line counter found in the PDBParser() 365 # this number should match the '_atom_site.id' index in the MMCIF 366 structure_builder.set_line_counter(i) 367 368 x = x_list[i] 369 y = y_list[i] 370 z = z_list[i] 371 resname = residue_id_list[i] 372 chainid = chain_id_list[i] 373 altloc = alt_list[i] 374 if altloc == ".": 375 altloc = " " 376 int_resseq = int(seq_id_list[i]) 377 icode = icode_list[i] 378 if icode == "?": 379 icode = " " 380 name = atom_id_list[i].strip('"') # Remove occasional " from quoted atom names (e.g. xNA) 381 382 # occupancy & B factor 383 try: 384 tempfactor = float(b_factor_list[i]) 385 except ValueError: 386 raise PDBConstructionException("Invalid or missing B factor") 387 388 try: 389 occupancy = float(occupancy_list[i]) 390 except ValueError: 391 raise PDBConstructionException("Invalid or missing occupancy") 392 393 fieldname = fieldname_list[i] 394 if fieldname == "HETATM": 395 hetatm_flag = "H" 396 else: 397 hetatm_flag = " " 398 399 resseq = (hetatm_flag, int_resseq, icode) 400 401 if serial_list is not None: 402 # model column exists; use it 403 serial_id = serial_list[i] 404 if current_serial_id != serial_id: 405 # if serial changes, update it and start new model 406 current_serial_id = serial_id 407 current_model_id += 1 408 structure_builder.init_model(current_model_id, current_serial_id) 409 current_chain_id = None 410 current_residue_id = None 411 current_resname = None 412 else: 413 # no explicit model column; initialize single model 414 structure_builder.init_model(current_model_id) 415 416 if current_chain_id != chainid: 417 current_chain_id = chainid 418 structure_builder.init_chain(current_chain_id) 419 current_residue_id = None 420 current_resname = None 421 422 if current_residue_id != resseq or current_resname != resname: 423 current_residue_id = resseq 424 current_resname = resname 425 structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) 426 427 coord = numpy.array((x, y, z), 'f') 428 element = element_list[i] if element_list else None 429 structure_builder.init_atom(name, coord, tempfactor, occupancy, altloc, 430 name, element=element) 431 if aniso_flag == 1: 432 u = (aniso_u11[i], aniso_u12[i], aniso_u13[i], 433 aniso_u22[i], aniso_u23[i], aniso_u33[i]) 434 mapped_anisou = [float(x) for x in u] 435 anisou_array = numpy.array(mapped_anisou, 'f') 436 structure_builder.set_anisou(anisou_array)
437