Package Bio :: Package PDB :: Module parse_pdb_header'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.parse_pdb_header'

  1  #!/usr/bin/env python 
  2  # 
  3  # parse_pdb_header.py 
  4  # parses header of PDB files into a python dictionary. 
  5  # emerged from the Columba database project www.columba-db.de. 
  6  # 
  7  # author: Kristian Rother 
  8  # 
  9  # license: same as BioPython, read LICENSE.TXT from current BioPython release. 
 10  # 
 11  # last modified: 9.2.2004 
 12  # 
 13  # Added some small changes: the whole PDB file is not read in anymore, but just 
 14  # until the first ATOM record (faster). I also split parse_pdb_header into 
 15  # parse_pdb_header and parse_pdb_header_list, because parse_pdb_header_list 
 16  # can be more easily reused in PDBParser. 
 17  # 
 18  # Thomas, 19/03/04 
 19  # 
 20  # Renamed some clearly private functions to _something (ie. parse_pdb_header_list 
 21  # is now _parse_pdb_header_list) 
 22  # Thomas 9/05/04 
 23   
 24  """Parse the header of a PDB file.""" 
 25   
 26  from __future__ import print_function 
 27   
 28  import re 
 29   
 30  from Bio import File 
 31   
 32   
33 -def _get_journal(inl):
34 # JRNL AUTH L.CHEN,M.DOI,F.S.MATHEWS,A.Y.CHISTOSERDOV, 2BBK 7 35 journal = "" 36 for l in inl: 37 if re.search("\AJRNL", l): 38 journal += l[19:72].lower() 39 journal = re.sub("\s\s+", " ", journal) 40 return journal
41 42
43 -def _get_references(inl):
44 # REMARK 1 REFERENCE 1 1CSE 11 45 # REMARK 1 AUTH W.BODE,E.PAPAMOKOS,D.MUSIL 1CSE 12 46 references = [] 47 actref = "" 48 for l in inl: 49 if re.search("\AREMARK 1", l): 50 if re.search("\AREMARK 1 REFERENCE", l): 51 if actref != "": 52 actref = re.sub("\s\s+", " ", actref) 53 if actref != " ": 54 references.append(actref) 55 actref = "" 56 else: 57 actref += l[19:72].lower() 58 59 if actref != "": 60 actref = re.sub("\s\s+", " ", actref) 61 if actref != " ": 62 references.append(actref) 63 return references
# Convert PDB dates (DD-Mon-YY) to ISO format, e.g. 1909-01-08.
67 -def _format_date(pdb_date):
68 """Converts dates from DD-Mon-YY to YYYY-MM-DD format.""" 69 date = "" 70 year = int(pdb_date[7:]) 71 if year < 50: 72 century = 2000 73 else: 74 century = 1900 75 date = str(century + year) + "-" 76 all_months = ['xxx', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 77 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] 78 month = str(all_months.index(pdb_date[3:6])) 79 if len(month) == 1: 80 month = '0' + month 81 date = date + month + '-' + pdb_date[:2] 82 return date
83 84
85 -def _chop_end_codes(line):
86 """Chops lines ending with ' 1CSA 14' and the like.""" 87 return re.sub("\s\s\s\s+[\w]{4}.\s+\d*\Z", "", line)
88 89
90 -def _chop_end_misc(line):
91 """Chops lines ending with ' 14-JUL-97 1CSA' and the like.""" 92 return re.sub("\s\s\s\s+.*\Z", "", line)
93 94
95 -def _nice_case(line):
96 """Makes A Lowercase String With Capitals.""" 97 l = line.lower() 98 s = "" 99 i = 0 100 nextCap = 1 101 while i < len(l): 102 c = l[i] 103 if c >= 'a' and c <= 'z' and nextCap: 104 c = c.upper() 105 nextCap = 0 106 elif c == ' ' or c == '.' or c == ',' or c == ';' or c == ':' or c == '\t' or\ 107 c == '-' or c == '_': 108 nextCap = 1 109 s += c 110 i += 1 111 return s
112 113
def parse_pdb_header(infile):
    """Return the header lines of a PDB file as a dictionary.

    ``infile`` is handed to ``File.as_handle`` and read line by line.
    Reading stops at the first ATOM, HETATM or MODEL record, so the
    coordinate section is never loaded.  The lines collected up to that
    point are passed to ``_parse_pdb_header_list``.

    Dictionary keys are: head, deposition_date, release_date,
    structure_method, resolution, structure_reference, journal_reference,
    author and compound (among others set by ``_parse_pdb_header_list``).
    """
    header = []
    with File.as_handle(infile, 'r') as handle:
        for line in handle:
            # Compare the right-stripped record name: the raw slice is six
            # characters wide, so it can never equal a shorter padded
            # literal and ATOM records would not stop the scan.
            if line[0:6].rstrip() in ("ATOM", "HETATM", "MODEL"):
                break
            header.append(line)
    return _parse_pdb_header_list(header)
131 132
def _parse_pdb_header_list(header):
    """Parse a list of PDB header lines into a dictionary.

    ``header`` is an iterable of header lines (with or without line
    endings).  The returned dictionary always has the keys ``name``,
    ``head``, ``deposition_date``, ``release_date``, ``structure_method``,
    ``resolution``, ``structure_reference``, ``journal_reference``,
    ``author``, ``compound`` and ``source``.  ``keywords`` and ``journal``
    are added only when KEYWDS / JRNL records are present.

    ``compound`` and ``source`` map a molecule id (string) to a dictionary
    of that molecule's fields.  ``resolution`` is a float, or ``None``
    when the REMARK 2 value cannot be parsed as a number.
    """
    # database fields
    header_dict = {'name': "",
                   'head': '',
                   'deposition_date': "1909-01-08",
                   'release_date': "1909-01-08",
                   'structure_method': "unknown",
                   'resolution': 0.0,
                   'structure_reference': "unknown",
                   'journal_reference': "unknown",
                   'author': "",
                   'compound': {'1': {'misc': ''}},
                   'source': {'1': {'misc': ''}}}

    header_dict['structure_reference'] = _get_references(header)
    header_dict['journal_reference'] = _get_journal(header)

    # COMPND and SOURCE each keep their own current molecule id and the
    # key that continuation lines are appended to.
    comp_molid = "1"
    src_molid = "1"
    last_comp_key = "misc"
    last_src_key = "misc"

    date_pattern = re.compile(r"\d\d-\w\w\w-\d\d")
    trailing_semicolon = re.compile(r"\;\s*\Z")
    # Spacing between the tokens is matched loosely so the tag is found
    # regardless of how many blanks separate "REMARK", "2" and the keyword.
    resolution_tag = re.compile(r"REMARK\s+2\s+RESOLUTION.")

    for raw_line in header:
        line = raw_line.rstrip()  # chop linebreaks and trailing blanks off
        key = line[:6].strip()
        tail = line[10:].strip()

        # From here, all the keys from the header are being parsed
        if key == "TITLE":
            # 'name' starts out as "", so every TITLE line is appended
            # after a space (a non-empty name therefore begins with one);
            # kept as is for backward compatibility.
            header_dict['name'] += " " + _chop_end_codes(tail).lower()
        elif key == "HEADER":
            found = date_pattern.search(tail)
            if found is not None:
                header_dict['deposition_date'] = _format_date(
                    _nice_case(found.group()))
            header_dict['head'] = _chop_end_misc(tail).lower()
        elif key == "COMPND":
            text = trailing_semicolon.sub("", _chop_end_codes(tail)).lower()
            # look for E.C. numbers in COMPND lines
            ec_match = re.search(r'\d+\.\d+\.\d+\.\d+', text)
            if ec_match:
                header_dict['compound'][comp_molid]['ec_number'] = \
                    ec_match.group()
                text = re.sub(r"\((e\.c\.)*\d+\.\d+\.\d+\.\d+\)", "", text)
            comp_molid, last_comp_key = _store_molecule_token(
                header_dict['compound'], text, comp_molid, last_comp_key)
        elif key == "SOURCE":
            text = trailing_semicolon.sub("", _chop_end_codes(tail)).lower()
            # Track the SOURCE molecule id separately from the COMPND one,
            # so SOURCE lines without MOL_ID do not land on (or fail to
            # find) the last compound's id.
            src_molid, last_src_key = _store_molecule_token(
                header_dict['source'], text, src_molid, last_src_key)
        elif key == "KEYWDS":
            kwd = _chop_end_codes(tail).lower()
            if 'keywords' in header_dict:
                header_dict['keywords'] += " " + kwd
            else:
                header_dict['keywords'] = kwd
        elif key == "EXPDTA":
            expd = _chop_end_codes(tail)
            # chop junk at end of lines for some structures
            expd = re.sub(r'\s\s\s\s\s\s\s.*\Z', '', expd)
            header_dict['structure_method'] = expd.lower()
        elif key == "REVDAT":
            found = date_pattern.search(tail)
            if found is not None:
                header_dict['release_date'] = _format_date(
                    _nice_case(found.group()))
        elif key == "JRNL":
            header_dict['journal'] = header_dict.get('journal', '') + tail
        elif key == "AUTHOR":
            header_dict['author'] += _nice_case(_chop_end_codes(tail))
        elif key == "REMARK":
            if resolution_tag.search(raw_line):
                value = _chop_end_codes(resolution_tag.sub('', raw_line))
                value = re.sub(r"\s+ANGSTROM.*", "", value)
                try:
                    header_dict['resolution'] = float(value)
                except ValueError:
                    # nonstandard resolution, e.g. "NOT APPLICABLE"
                    header_dict['resolution'] = None
        # CAVEAT and all other record types are currently ignored.

    resolution = header_dict['resolution']
    if header_dict['structure_method'] == 'unknown':
        # resolution may be None (see above); None cannot be ordered
        # against a float on Python 3.
        if resolution is not None and resolution > 0.0:
            header_dict['structure_method'] = 'x-ray diffraction'
    return header_dict


def _store_molecule_token(section, text, molid, last_key):
    """Record one COMPND/SOURCE specification line in ``section``.

    ``section`` maps molecule ids to field dictionaries.  ``text`` is
    either ``key: value`` or, when it contains no colon, the continuation
    of the previous value.  A ``mol_id`` key opens a new molecule entry.

    Returns the ``(molid, last_key)`` pair to use for the next line.
    """
    tok = text.split(":")
    if len(tok) < 2:
        # No colon: continuation text for the most recent key.
        section[molid][last_key] += tok[0] + " "
        return molid, last_key
    ckey = tok[0]
    cval = re.sub(r"\A\s*", "", tok[1])
    if ckey == 'mol_id':
        section[cval] = {'misc': ''}
        return cval, "misc"
    section[molid][ckey] = cval
    return molid, ckey
if __name__ == '__main__':
    # Script usage: the first command-line argument names a PDB file.
    # Its header is parsed and every entry of the resulting dictionary is
    # printed, each preceded by a separator line.
    import sys

    pdb_path = sys.argv[1]
    with open(pdb_path, 'r') as pdb_handle:
        parsed = parse_pdb_header(pdb_handle)

    separator = "-" * 40
    for field, value in parsed.items():
        print(separator)
        print(field)
        print(value)