Package Bio :: Package PDB :: Module MMCIFParser'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.MMCIFParser'

  1  # Copyright (C) 2002, Thomas Hamelryck (thamelry@binf.ku.dk) 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """mmCIF parsers.""" 
  7   
  8  from __future__ import print_function 
  9   
 10  import numpy 
 11  import warnings 
 12   
 13  from Bio.File import as_handle 
 14  from Bio._py3k import range 
 15   
 16  from Bio.PDB.MMCIF2Dict import MMCIF2Dict 
 17  from Bio.PDB.StructureBuilder import StructureBuilder 
 18  from Bio.PDB.PDBExceptions import PDBConstructionException 
 19  from Bio.PDB.PDBExceptions import PDBConstructionWarning 
 20   
 21   
22 -class MMCIFParser(object):
23 """Parse a mmCIF file and return a Structure object.""" 24
25 - def __init__(self, structure_builder=None, QUIET=False):
26 """Create a PDBParser object. 27 28 The mmCIF parser calls a number of standard methods in an aggregated 29 StructureBuilder object. Normally this object is instanciated by the 30 MMCIParser object itself, but if the user provides his/her own 31 StructureBuilder object, the latter is used instead. 32 33 Arguments: 34 - structure_builder - an optional user implemented StructureBuilder class. 35 - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 36 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 37 These warnings might be indicative of problems in the mmCIF file! 38 39 """ 40 if structure_builder is not None: 41 self._structure_builder = structure_builder 42 else: 43 self._structure_builder = StructureBuilder() 44 # self.header = None 45 # self.trailer = None 46 self.line_counter = 0 47 self.build_structure = None 48 self.QUIET = bool(QUIET)
49 50 # Public methods 51
52 - def get_structure(self, structure_id, filename):
53 """Return the structure. 54 55 Arguments: 56 - structure_id - string, the id that will be used for the structure 57 - filename - name of mmCIF file, OR an open text mode file handle 58 59 """ 60 with warnings.catch_warnings(): 61 if self.QUIET: 62 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 63 self._mmcif_dict = MMCIF2Dict(filename) 64 self._build_structure(structure_id) 65 66 return self._structure_builder.get_structure()
67 68 # Private methods 69
70 - def _build_structure(self, structure_id):
71 72 # two special chars as placeholders in the mmCIF format 73 # for item values that cannot be explicitly assigned 74 # see: pdbx/mmcif syntax web page 75 _unassigned = set(('.', '?')) 76 77 mmcif_dict = self._mmcif_dict 78 atom_id_list = mmcif_dict["_atom_site.label_atom_id"] 79 residue_id_list = mmcif_dict["_atom_site.label_comp_id"] 80 try: 81 element_list = mmcif_dict["_atom_site.type_symbol"] 82 except KeyError: 83 element_list = None 84 chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] 85 x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] 86 y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] 87 z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] 88 alt_list = mmcif_dict["_atom_site.label_alt_id"] 89 icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] 90 b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] 91 occupancy_list = mmcif_dict["_atom_site.occupancy"] 92 fieldname_list = mmcif_dict["_atom_site.group_PDB"] 93 try: 94 serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] 95 except KeyError: 96 # No model number column 97 serial_list = None 98 except ValueError: 99 # Invalid model number (malformed file) 100 raise PDBConstructionException("Invalid model number") 101 try: 102 aniso_u11 = mmcif_dict["_atom_site.aniso_U[1][1]"] 103 aniso_u12 = mmcif_dict["_atom_site.aniso_U[1][2]"] 104 aniso_u13 = mmcif_dict["_atom_site.aniso_U[1][3]"] 105 aniso_u22 = mmcif_dict["_atom_site.aniso_U[2][2]"] 106 aniso_u23 = mmcif_dict["_atom_site.aniso_U[2][3]"] 107 aniso_u33 = mmcif_dict["_atom_site.aniso_U[3][3]"] 108 aniso_flag = 1 109 except KeyError: 110 # no anisotropic B factors 111 aniso_flag = 0 112 # if auth_seq_id is present, we use this. 113 # Otherwise label_seq_id is used. 114 if "_atom_site.auth_seq_id" in mmcif_dict: 115 seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] 116 else: 117 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 118 # Now loop over atoms and build the structure 119 current_chain_id = None 120 current_residue_id = None 121 current_resname = None 122 structure_builder = self._structure_builder 123 structure_builder.init_structure(structure_id) 124 structure_builder.init_seg(" ") 125 # Historically, Biopython PDB parser uses model_id to mean array index 126 # so serial_id means the Model ID specified in the file 127 current_model_id = -1 128 current_serial_id = -1 129 for i in range(0, len(atom_id_list)): 130 131 # set the line_counter for 'ATOM' lines only and not 132 # as a global line counter found in the PDBParser() 133 # this number should match the '_atom_site.id' index in the MMCIF 134 structure_builder.set_line_counter(i) 135 136 x = x_list[i] 137 y = y_list[i] 138 z = z_list[i] 139 resname = residue_id_list[i] 140 chainid = chain_id_list[i] 141 altloc = alt_list[i] 142 if altloc in _unassigned: 143 altloc = " " 144 int_resseq = int(seq_id_list[i]) 145 icode = icode_list[i] 146 if icode in _unassigned: 147 icode = " " 148 name = atom_id_list[i] 149 # occupancy & B factor 150 try: 151 tempfactor = float(b_factor_list[i]) 152 except ValueError: 153 raise PDBConstructionException("Invalid or missing B factor") 154 try: 155 occupancy = float(occupancy_list[i]) 156 except ValueError: 157 raise PDBConstructionException("Invalid or missing occupancy") 158 fieldname = fieldname_list[i] 159 if fieldname == "HETATM": 160 if resname == "HOH" or resname == "WAT": 161 hetatm_flag = "W" 162 else: 163 hetatm_flag = "H" 164 else: 165 hetatm_flag = " " 166 167 resseq = (hetatm_flag, int_resseq, icode) 168 169 if serial_list is not None: 170 # model column exists; use it 171 serial_id = serial_list[i] 172 if current_serial_id != serial_id: 173 # if serial changes, update it and start new model 174 current_serial_id = serial_id 175 current_model_id += 1 176 structure_builder.init_model(current_model_id, current_serial_id) 177 current_chain_id = None 178 current_residue_id = None 179 current_resname = None 180 else: 181 # no explicit model column; initialize single model 182 structure_builder.init_model(current_model_id) 183 184 if current_chain_id != chainid: 185 current_chain_id = chainid 186 structure_builder.init_chain(current_chain_id) 187 current_residue_id = None 188 current_resname = None 189 190 if current_residue_id != resseq or current_resname != resname: 191 current_residue_id = resseq 192 current_resname = resname 193 structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) 194 195 coord = numpy.array((x, y, z), 'f') 196 element = element_list[i].upper() if element_list else None 197 structure_builder.init_atom(name, coord, tempfactor, occupancy, altloc, 198 name, element=element) 199 if aniso_flag == 1: 200 u = (aniso_u11[i], aniso_u12[i], aniso_u13[i], 201 aniso_u22[i], aniso_u23[i], aniso_u33[i]) 202 mapped_anisou = [float(_) for _ in u] 203 anisou_array = numpy.array(mapped_anisou, 'f') 204 structure_builder.set_anisou(anisou_array) 205 # Now try to set the cell 206 try: 207 a = float(mmcif_dict["_cell.length_a"]) 208 b = float(mmcif_dict["_cell.length_b"]) 209 c = float(mmcif_dict["_cell.length_c"]) 210 alpha = float(mmcif_dict["_cell.angle_alpha"]) 211 beta = float(mmcif_dict["_cell.angle_beta"]) 212 gamma = float(mmcif_dict["_cell.angle_gamma"]) 213 cell = numpy.array((a, b, c, alpha, beta, gamma), 'f') 214 spacegroup = mmcif_dict["_symmetry.space_group_name_H-M"] 215 spacegroup = spacegroup[1:-1] # get rid of quotes!! 216 if spacegroup is None: 217 raise Exception 218 structure_builder.set_symmetry(spacegroup, cell) 219 except Exception: 220 pass # no cell found, so just ignore
221 222
223 -class FastMMCIFParser(object):
224 """Parse an MMCIF file and return a Structure object.""" 225
226 - def __init__(self, structure_builder=None, QUIET=False):
227 """Create a FastMMCIFParser object. 228 229 The mmCIF parser calls a number of standard methods in an aggregated 230 StructureBuilder object. Normally this object is instanciated by the 231 parser object itself, but if the user provides his/her own 232 StructureBuilder object, the latter is used instead. 233 234 The main difference between this class and the regular MMCIFParser is 235 that only 'ATOM' and 'HETATM' lines are parsed here. Use if you are 236 interested only in coordinate information. 237 238 Arguments: 239 - structure_builder - an optional user implemented StructureBuilder class. 240 - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing 241 the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. 242 These warnings might be indicative of problems in the mmCIF file! 243 244 """ 245 if structure_builder is not None: 246 self._structure_builder = structure_builder 247 else: 248 self._structure_builder = StructureBuilder() 249 250 self.line_counter = 0 251 self.build_structure = None 252 self.QUIET = bool(QUIET)
253 254 # Public methods 255
256 - def get_structure(self, structure_id, filename):
257 """Return the structure. 258 259 Arguments: 260 - structure_id - string, the id that will be used for the structure 261 - filename - name of the mmCIF file OR an open filehandle 262 263 """ 264 with warnings.catch_warnings(): 265 if self.QUIET: 266 warnings.filterwarnings("ignore", category=PDBConstructionWarning) 267 with as_handle(filename) as handle: 268 self._build_structure(structure_id, handle) 269 270 return self._structure_builder.get_structure()
271 272 # Private methods 273
274 - def _build_structure(self, structure_id, filehandle):
275 276 # two special chars as placeholders in the mmCIF format 277 # for item values that cannot be explicitly assigned 278 # see: pdbx/mmcif syntax web page 279 _unassigned = set(('.', '?')) 280 281 # Read only _atom_site. and atom_site_anisotrop entries 282 read_atom, read_aniso = False, False 283 _fields, _records = [], [] 284 _anisof, _anisors = [], [] 285 for line in filehandle: 286 if line.startswith('_atom_site.'): 287 read_atom = True 288 _fields.append(line.strip()) 289 elif line.startswith('_atom_site_anisotrop.'): 290 read_aniso = True 291 _anisof.append(line.strip()) 292 elif read_atom and line.startswith('#'): 293 read_atom = False 294 elif read_aniso and line.startswith('#'): 295 read_aniso = False 296 elif read_atom: 297 _records.append(line.strip()) 298 elif read_aniso: 299 _anisors.append(line.strip()) 300 301 # Dumping the shlex module here since this particular 302 # category should be rather straightforward. 303 # Quite a performance boost.. 304 _record_tbl = zip(*map(str.split, _records)) 305 _anisob_tbl = zip(*map(str.split, _anisors)) 306 307 mmcif_dict = dict(zip(_fields, _record_tbl)) 308 mmcif_dict.update(dict(zip(_anisof, _anisob_tbl))) 309 310 # Build structure object 311 atom_id_list = mmcif_dict["_atom_site.label_atom_id"] 312 residue_id_list = mmcif_dict["_atom_site.label_comp_id"] 313 314 try: 315 element_list = mmcif_dict["_atom_site.type_symbol"] 316 except KeyError: 317 element_list = None 318 319 chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] 320 321 x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] 322 y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] 323 z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] 324 alt_list = mmcif_dict["_atom_site.label_alt_id"] 325 icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] 326 b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] 327 occupancy_list = mmcif_dict["_atom_site.occupancy"] 328 fieldname_list = mmcif_dict["_atom_site.group_PDB"] 329 330 try: 331 serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] 332 except KeyError: 333 # No model number column 334 serial_list = None 335 except ValueError: 336 # Invalid model number (malformed file) 337 raise PDBConstructionException("Invalid model number") 338 339 try: 340 aniso_u11 = mmcif_dict["_atom_site.aniso_U[1][1]"] 341 aniso_u12 = mmcif_dict["_atom_site.aniso_U[1][2]"] 342 aniso_u13 = mmcif_dict["_atom_site.aniso_U[1][3]"] 343 aniso_u22 = mmcif_dict["_atom_site.aniso_U[2][2]"] 344 aniso_u23 = mmcif_dict["_atom_site.aniso_U[2][3]"] 345 aniso_u33 = mmcif_dict["_atom_site.aniso_U[3][3]"] 346 aniso_flag = 1 347 except KeyError: 348 # no anisotropic B factors 349 aniso_flag = 0 350 351 # if auth_seq_id is present, we use this. 352 # Otherwise label_seq_id is used. 353 if "_atom_site.auth_seq_id" in mmcif_dict: 354 seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] 355 else: 356 seq_id_list = mmcif_dict["_atom_site.label_seq_id"] 357 358 # Now loop over atoms and build the structure 359 current_chain_id = None 360 current_residue_id = None 361 current_resname = None 362 structure_builder = self._structure_builder 363 structure_builder.init_structure(structure_id) 364 structure_builder.init_seg(" ") 365 366 # Historically, Biopython PDB parser uses model_id to mean array index 367 # so serial_id means the Model ID specified in the file 368 current_model_id = -1 369 current_serial_id = -1 370 for i in range(0, len(atom_id_list)): 371 372 # set the line_counter for 'ATOM' lines only and not 373 # as a global line counter found in the PDBParser() 374 # this number should match the '_atom_site.id' index in the MMCIF 375 structure_builder.set_line_counter(i) 376 377 x = x_list[i] 378 y = y_list[i] 379 z = z_list[i] 380 resname = residue_id_list[i] 381 chainid = chain_id_list[i] 382 altloc = alt_list[i] 383 if altloc in _unassigned: 384 altloc = " " 385 int_resseq = int(seq_id_list[i]) 386 icode = icode_list[i] 387 if icode in _unassigned: 388 icode = " " 389 name = atom_id_list[i].strip('"') # Remove occasional " from quoted atom names (e.g. xNA) 390 391 # occupancy & B factor 392 try: 393 tempfactor = float(b_factor_list[i]) 394 except ValueError: 395 raise PDBConstructionException("Invalid or missing B factor") 396 397 try: 398 occupancy = float(occupancy_list[i]) 399 except ValueError: 400 raise PDBConstructionException("Invalid or missing occupancy") 401 402 fieldname = fieldname_list[i] 403 if fieldname == "HETATM": 404 hetatm_flag = "H" 405 else: 406 hetatm_flag = " " 407 408 resseq = (hetatm_flag, int_resseq, icode) 409 410 if serial_list is not None: 411 # model column exists; use it 412 serial_id = serial_list[i] 413 if current_serial_id != serial_id: 414 # if serial changes, update it and start new model 415 current_serial_id = serial_id 416 current_model_id += 1 417 structure_builder.init_model(current_model_id, current_serial_id) 418 current_chain_id = None 419 current_residue_id = None 420 current_resname = None 421 else: 422 # no explicit model column; initialize single model 423 structure_builder.init_model(current_model_id) 424 425 if current_chain_id != chainid: 426 current_chain_id = chainid 427 structure_builder.init_chain(current_chain_id) 428 current_residue_id = None 429 current_resname = None 430 431 if current_residue_id != resseq or current_resname != resname: 432 current_residue_id = resseq 433 current_resname = resname 434 structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) 435 436 coord = numpy.array((x, y, z), 'f') 437 element = element_list[i] if element_list else None 438 structure_builder.init_atom(name, coord, tempfactor, occupancy, altloc, 439 name, element=element) 440 if aniso_flag == 1: 441 u = (aniso_u11[i], aniso_u12[i], aniso_u13[i], 442 aniso_u22[i], aniso_u23[i], aniso_u33[i]) 443 mapped_anisou = [float(_) for _ in u] 444 anisou_array = numpy.array(mapped_anisou, 'f') 445 structure_builder.set_anisou(anisou_array)
446