Package Bio :: Package SeqIO :: Module PdbIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.PdbIO

  1  # Copyright 2012 by Eric Talevich.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  import collections 
  6  import warnings 
  7   
  8  from Bio.Alphabet import generic_protein 
  9  from Bio.Seq import Seq 
 10  from Bio.SeqRecord import SeqRecord 
 11  from Bio.Data.SCOPData import protein_letters_3to1 
 12   
 13  __docformat__ = "restructuredtext en" 
 14   
 15   
16 -def PdbSeqresIterator(handle):
17 """Returns SeqRecord objects for each chain in a PDB file. 18 19 The sequences are derived from the SEQRES lines in the 20 PDB file header, not the atoms of the 3D structure. 21 22 Specifically, these PDB records are handled: DBREF, SEQADV, SEQRES, MODRES 23 24 See: http://www.wwpdb.org/documentation/format23/sect3.html 25 """ 26 # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils 27 from Bio.SeqUtils import seq1 28 29 chains = collections.defaultdict(list) 30 metadata = collections.defaultdict(list) 31 for line in handle: 32 rec_name = line[0:6].strip() 33 if rec_name == 'SEQRES': 34 # NB: We only actually need chain ID and the residues here; 35 # commented bits are placeholders from the wwPDB spec. 36 # Serial number of the SEQRES record for the current chain. 37 # Starts at 1 and increments by one each line. 38 # Reset to 1 for each chain. 39 # ser_num = int(line[8:10]) 40 # Chain identifier. This may be any single legal character, 41 # including a blank which is used if there is only one chain. 42 chn_id = line[11] 43 # Number of residues in the chain (repeated on every record) 44 # num_res = int(line[13:17]) 45 residues = [seq1(res, custom_map=protein_letters_3to1) for res in line[19:].split()] 46 chains[chn_id].extend(residues) 47 elif rec_name == 'DBREF': 48 # ID code of this entry (PDB ID) 49 pdb_id = line[7:11] 50 # Chain identifier. 51 chn_id = line[12] 52 # Initial sequence number of the PDB sequence segment. 53 # seq_begin = int(line[14:18]) 54 # Initial insertion code of the PDB sequence segment. 55 # icode_begin = line[18] 56 # Ending sequence number of the PDB sequence segment. 57 # seq_end = int(line[20:24]) 58 # Ending insertion code of the PDB sequence segment. 59 # icode_end = line[24] 60 # Sequence database name. 61 database = line[26:32].strip() 62 # Sequence database accession code. 63 db_acc = line[33:41].strip() 64 # Sequence database identification code. 65 db_id_code = line[42:54].strip() 66 # Initial sequence number of the database seqment. 67 # db_seq_begin = int(line[55:60]) 68 # Insertion code of initial residue of the segment, if PDB is the 69 # reference. 70 # db_icode_begin = line[60] 71 # Ending sequence number of the database segment. 72 # db_seq_end = int(line[62:67]) 73 # Insertion code of the ending residue of the segment, if PDB is the 74 # reference. 75 # db_icode_end = line[67] 76 metadata[chn_id].append({'pdb_id': pdb_id, 'database': database, 77 'db_acc': db_acc, 'db_id_code': db_id_code}) 78 # ENH: 'SEQADV' 'MODRES' 79 80 for chn_id, residues in sorted(chains.items()): 81 record = SeqRecord(Seq(''.join(residues), generic_protein)) 82 record.annotations = {"chain": chn_id} 83 if chn_id in metadata: 84 m = metadata[chn_id][0] 85 record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id) 86 record.description = ("%s:%s %s" % (m['database'], 87 m['db_acc'], 88 m['db_id_code'])) 89 for melem in metadata[chn_id]: 90 record.dbxrefs.extend([ 91 "%s:%s" % (melem['database'], melem['db_acc']), 92 "%s:%s" % (melem['database'], melem['db_id_code'])]) 93 else: 94 record.id = chn_id 95 yield record
96 97
98 -def PdbAtomIterator(handle):
99 """Returns SeqRecord objects for each chain in a PDB file 100 101 The sequences are derived from the 3D structure (ATOM records), not the 102 SEQRES lines in the PDB file header. 103 104 Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries 105 are converted to "X" in the sequence. 106 107 In addition to information from the PDB header (which is the same for all 108 records), the following chain specific information is placed in the 109 annotation: 110 111 record.annotations["residues"] = List of residue ID strings 112 record.annotations["chain"] = Chain ID (typically A, B ,...) 113 record.annotations["model"] = Model ID (typically zero) 114 115 Where amino acids are missing from the structure, as indicated by residue 116 numbering, the sequence is filled in with 'X' characters to match the size 117 of the missing region, and None is included as the corresponding entry in 118 the list record.annotations["residues"]. 119 120 This function uses the Bio.PDB module to do most of the hard work. The 121 annotation information could be improved but this extra parsing should be 122 done in parse_pdb_header, not this module. 123 """ 124 # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO 125 from Bio.PDB import PDBParser 126 from Bio.SeqUtils import seq1 127 128 def restype(residue): 129 """Return a residue's type as a one-letter code. 130 131 Non-standard residues (e.g. CSD, ANP) are returned as 'X'. 132 """ 133 return seq1(residue.resname, custom_map=protein_letters_3to1)
134 135 # Deduce the PDB ID from the PDB header 136 # ENH: or filename? 137 from Bio.File import UndoHandle 138 undo_handle = UndoHandle(handle) 139 firstline = undo_handle.peekline() 140 if firstline.startswith("HEADER"): 141 pdb_id = firstline[62:66] 142 else: 143 warnings.warn("First line is not a 'HEADER'; can't determine PDB ID") 144 pdb_id = '????' 145 146 struct = PDBParser().get_structure(pdb_id, undo_handle) 147 model = struct[0] 148 for chn_id, chain in sorted(model.child_dict.items()): 149 # HETATM mod. res. policy: remove mod if in sequence, else discard 150 residues = [res for res in chain.get_unpacked_list() 151 if seq1(res.get_resname().upper(), 152 custom_map=protein_letters_3to1) != "X"] 153 if not residues: 154 continue 155 # Identify missing residues in the structure 156 # (fill the sequence with 'X' residues in these regions) 157 gaps = [] 158 rnumbers = [r.id[1] for r in residues] 159 for i, rnum in enumerate(rnumbers[:-1]): 160 if rnumbers[i + 1] != rnum + 1: 161 # It's a gap! 162 gaps.append((i + 1, rnum, rnumbers[i + 1])) 163 if gaps: 164 res_out = [] 165 prev_idx = 0 166 for i, pregap, postgap in gaps: 167 if postgap > pregap: 168 gapsize = postgap - pregap - 1 169 res_out.extend(restype(x) for x in residues[prev_idx:i]) 170 prev_idx = i 171 res_out.append('X' * gapsize) 172 else: 173 warnings.warn("Ignoring out-of-order residues after a gap", 174 UserWarning) 175 # Keep the normal part, drop the out-of-order segment 176 # (presumably modified or hetatm residues, e.g. 3BEG) 177 res_out.extend(restype(x) for x in residues[prev_idx:i]) 178 break 179 else: 180 # Last segment 181 res_out.extend(restype(x) for x in residues[prev_idx:]) 182 else: 183 # No gaps 184 res_out = [restype(x) for x in residues] 185 record_id = "%s:%s" % (pdb_id, chn_id) 186 # ENH - model number in SeqRecord id if multiple models? 187 # id = "Chain%s" % str(chain.id) 188 # if len(structure) > 1 : 189 # id = ("Model%s|" % str(model.id)) + id 190 191 record = SeqRecord(Seq(''.join(res_out), generic_protein), 192 id=record_id, 193 description=record_id, 194 ) 195 196 # The PDB header was loaded as a dictionary, so let's reuse it all 197 record.annotations = struct.header.copy() 198 # Plus some chain specifics: 199 record.annotations["model"] = model.id 200 record.annotations["chain"] = chain.id 201 202 # Start & end 203 record.annotations["start"] = int(rnumbers[0]) 204 record.annotations["end"] = int(rnumbers[-1]) 205 206 # ENH - add letter annotations -- per-residue info, e.g. numbers 207 208 yield record 209 210 211 if __name__ == '__main__': 212 # Test 213 import sys 214 from Bio import SeqIO 215 for fname in sys.argv[1:]: 216 for parser in (PdbSeqresIterator, PdbAtomIterator): 217 with open(fname) as handle: 218 records = parser(handle) 219 SeqIO.write(records, sys.stdout, 'fasta') 220