Package Bio :: Package SeqIO :: Module PdbIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.PdbIO

  1  # Copyright 2012 by Eric Talevich.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  import collections 
  6  import warnings 
  7   
  8  from Bio import BiopythonWarning 
  9  from Bio.Alphabet import generic_protein 
 10  from Bio.Seq import Seq 
 11  from Bio.SeqRecord import SeqRecord 
 12  from Bio.Data.SCOPData import protein_letters_3to1 
 13   
 14   
15 -def PdbSeqresIterator(handle):
16 """Returns SeqRecord objects for each chain in a PDB file. 17 18 The sequences are derived from the SEQRES lines in the 19 PDB file header, not the atoms of the 3D structure. 20 21 Specifically, these PDB records are handled: DBREF, SEQADV, SEQRES, MODRES 22 23 See: http://www.wwpdb.org/documentation/format23/sect3.html 24 25 This gets called internally via Bio.SeqIO for the SEQRES based interpretation 26 of the PDB file format: 27 28 >>> from Bio import SeqIO 29 >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-seqres"): 30 ... print("Record id %s, chain %s" % (record.id, record.annotations["chain"])) 31 ... print(record.dbxrefs) 32 ... 33 Record id 1A8O:A, chain A 34 ['UNP:P12497', 'UNP:POL_HV1N5'] 35 36 Equivalently, 37 38 >>> with open("PDB/1A8O.pdb") as handle: 39 ... for record in PdbSeqresIterator(handle): 40 ... print("Record id %s, chain %s" % (record.id, record.annotations["chain"])) 41 ... print(record.dbxrefs) 42 ... 43 Record id 1A8O:A, chain A 44 ['UNP:P12497', 'UNP:POL_HV1N5'] 45 46 Note the chain is recorded in the annotations dictionary, and any PDB DBREF 47 lines are recorded in the database cross-references list. 48 """ 49 # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils 50 from Bio.SeqUtils import seq1 51 52 chains = collections.defaultdict(list) 53 metadata = collections.defaultdict(list) 54 for line in handle: 55 rec_name = line[0:6].strip() 56 if rec_name == 'SEQRES': 57 # NB: We only actually need chain ID and the residues here; 58 # commented bits are placeholders from the wwPDB spec. 59 # Serial number of the SEQRES record for the current chain. 60 # Starts at 1 and increments by one each line. 61 # Reset to 1 for each chain. 62 # ser_num = int(line[8:10]) 63 # Chain identifier. This may be any single legal character, 64 # including a blank which is used if there is only one chain. 65 chn_id = line[11] 66 # Number of residues in the chain (repeated on every record) 67 # num_res = int(line[13:17]) 68 residues = [seq1(res, custom_map=protein_letters_3to1) for res in line[19:].split()] 69 chains[chn_id].extend(residues) 70 elif rec_name == 'DBREF': 71 # ID code of this entry (PDB ID) 72 pdb_id = line[7:11] 73 # Chain identifier. 74 chn_id = line[12] 75 # Initial sequence number of the PDB sequence segment. 76 # seq_begin = int(line[14:18]) 77 # Initial insertion code of the PDB sequence segment. 78 # icode_begin = line[18] 79 # Ending sequence number of the PDB sequence segment. 80 # seq_end = int(line[20:24]) 81 # Ending insertion code of the PDB sequence segment. 82 # icode_end = line[24] 83 # Sequence database name. 84 database = line[26:32].strip() 85 # Sequence database accession code. 86 db_acc = line[33:41].strip() 87 # Sequence database identification code. 88 db_id_code = line[42:54].strip() 89 # Initial sequence number of the database seqment. 90 # db_seq_begin = int(line[55:60]) 91 # Insertion code of initial residue of the segment, if PDB is the 92 # reference. 93 # db_icode_begin = line[60] 94 # Ending sequence number of the database segment. 95 # db_seq_end = int(line[62:67]) 96 # Insertion code of the ending residue of the segment, if PDB is the 97 # reference. 98 # db_icode_end = line[67] 99 metadata[chn_id].append({'pdb_id': pdb_id, 'database': database, 100 'db_acc': db_acc, 'db_id_code': db_id_code}) 101 # ENH: 'SEQADV' 'MODRES' 102 103 for chn_id, residues in sorted(chains.items()): 104 record = SeqRecord(Seq(''.join(residues), generic_protein)) 105 record.annotations = {"chain": chn_id} 106 if chn_id in metadata: 107 m = metadata[chn_id][0] 108 record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id) 109 record.description = ("%s:%s %s" % (m['database'], 110 m['db_acc'], 111 m['db_id_code'])) 112 for melem in metadata[chn_id]: 113 record.dbxrefs.extend([ 114 "%s:%s" % (melem['database'], melem['db_acc']), 115 "%s:%s" % (melem['database'], melem['db_id_code'])]) 116 else: 117 record.id = chn_id 118 yield record
119 120
121 -def PdbAtomIterator(handle):
122 """Returns SeqRecord objects for each chain in a PDB file 123 124 The sequences are derived from the 3D structure (ATOM records), not the 125 SEQRES lines in the PDB file header. 126 127 Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries 128 are converted to "X" in the sequence. 129 130 In addition to information from the PDB header (which is the same for all 131 records), the following chain specific information is placed in the 132 annotation: 133 134 record.annotations["residues"] = List of residue ID strings 135 record.annotations["chain"] = Chain ID (typically A, B ,...) 136 record.annotations["model"] = Model ID (typically zero) 137 138 Where amino acids are missing from the structure, as indicated by residue 139 numbering, the sequence is filled in with 'X' characters to match the size 140 of the missing region, and None is included as the corresponding entry in 141 the list record.annotations["residues"]. 142 143 This function uses the Bio.PDB module to do most of the hard work. The 144 annotation information could be improved but this extra parsing should be 145 done in parse_pdb_header, not this module. 146 147 This gets called internally via Bio.SeqIO for the atom based interpretation 148 of the PDB file format: 149 150 >>> from Bio import SeqIO 151 >>> for record in SeqIO.parse("PDB/1A8O.pdb", "pdb-atom"): 152 ... print("Record id %s, chain %s" % (record.id, record.annotations["chain"])) 153 ... 154 Record id 1A8O:A, chain A 155 156 Equivalently, 157 158 >>> with open("PDB/1A8O.pdb") as handle: 159 ... for record in PdbAtomIterator(handle): 160 ... print("Record id %s, chain %s" % (record.id, record.annotations["chain"])) 161 ... 162 Record id 1A8O:A, chain A 163 164 """ 165 # TODO - Add record.annotations to the doctest, esp the residues (not working?) 166 167 # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO 168 from Bio.PDB import PDBParser 169 from Bio.SeqUtils import seq1 170 171 def restype(residue): 172 """Return a residue's type as a one-letter code. 173 174 Non-standard residues (e.g. CSD, ANP) are returned as 'X'. 175 """ 176 return seq1(residue.resname, custom_map=protein_letters_3to1)
177 178 # Deduce the PDB ID from the PDB header 179 # ENH: or filename? 180 from Bio.File import UndoHandle 181 undo_handle = UndoHandle(handle) 182 firstline = undo_handle.peekline() 183 if firstline.startswith("HEADER"): 184 pdb_id = firstline[62:66] 185 else: 186 warnings.warn("First line is not a 'HEADER'; can't determine PDB ID. " 187 "Line: %r" % firstline, BiopythonWarning) 188 pdb_id = '????' 189 190 struct = PDBParser().get_structure(pdb_id, undo_handle) 191 model = struct[0] 192 for chn_id, chain in sorted(model.child_dict.items()): 193 # HETATM mod. res. policy: remove mod if in sequence, else discard 194 residues = [res for res in chain.get_unpacked_list() 195 if seq1(res.get_resname().upper(), 196 custom_map=protein_letters_3to1) != "X"] 197 if not residues: 198 continue 199 # Identify missing residues in the structure 200 # (fill the sequence with 'X' residues in these regions) 201 gaps = [] 202 rnumbers = [r.id[1] for r in residues] 203 for i, rnum in enumerate(rnumbers[:-1]): 204 if rnumbers[i + 1] != rnum + 1: 205 # It's a gap! 206 gaps.append((i + 1, rnum, rnumbers[i + 1])) 207 if gaps: 208 res_out = [] 209 prev_idx = 0 210 for i, pregap, postgap in gaps: 211 if postgap > pregap: 212 gapsize = postgap - pregap - 1 213 res_out.extend(restype(x) for x in residues[prev_idx:i]) 214 prev_idx = i 215 res_out.append('X' * gapsize) 216 else: 217 warnings.warn("Ignoring out-of-order residues after a gap", 218 BiopythonWarning) 219 # Keep the normal part, drop the out-of-order segment 220 # (presumably modified or hetatm residues, e.g. 3BEG) 221 res_out.extend(restype(x) for x in residues[prev_idx:i]) 222 break 223 else: 224 # Last segment 225 res_out.extend(restype(x) for x in residues[prev_idx:]) 226 else: 227 # No gaps 228 res_out = [restype(x) for x in residues] 229 record_id = "%s:%s" % (pdb_id, chn_id) 230 # ENH - model number in SeqRecord id if multiple models? 231 # id = "Chain%s" % str(chain.id) 232 # if len(structure) > 1 : 233 # id = ("Model%s|" % str(model.id)) + id 234 235 record = SeqRecord(Seq(''.join(res_out), generic_protein), 236 id=record_id, 237 description=record_id, 238 ) 239 240 # The PDB header was loaded as a dictionary, so let's reuse it all 241 record.annotations = struct.header.copy() 242 # Plus some chain specifics: 243 record.annotations["model"] = model.id 244 record.annotations["chain"] = chain.id 245 246 # Start & end 247 record.annotations["start"] = int(rnumbers[0]) 248 record.annotations["end"] = int(rnumbers[-1]) 249 250 # ENH - add letter annotations -- per-residue info, e.g. numbers 251 252 yield record 253 254 255 if __name__ == '__main__': 256 from Bio._utils import run_doctest 257 run_doctest(verbose=0) 258