Package Bio :: Package SeqIO :: Module PdbIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.PdbIO

  1  # Copyright 2012 by Eric Talevich.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  import collections 
  6  import warnings 
  7   
  8  from Bio.Alphabet import generic_protein 
  9  from Bio.Seq import Seq 
 10  from Bio.SeqRecord import SeqRecord 
 11  from Bio.Data.SCOPData import protein_letters_3to1 
 12   
 13   
14 -def PdbSeqresIterator(handle):
15 """Returns SeqRecord objects for each chain in a PDB file. 16 17 The sequences are derived from the SEQRES lines in the 18 PDB file header, not the atoms of the 3D structure. 19 20 Specifically, these PDB records are handled: DBREF, SEQADV, SEQRES, MODRES 21 22 See: http://www.wwpdb.org/documentation/format23/sect3.html 23 """ 24 # Late-binding import to avoid circular dependency on SeqIO in Bio.SeqUtils 25 from Bio.SeqUtils import seq1 26 27 chains = collections.defaultdict(list) 28 metadata = collections.defaultdict(list) 29 for line in handle: 30 rec_name = line[0:6].strip() 31 if rec_name == 'SEQRES': 32 # NB: We only actually need chain ID and the residues here; 33 # commented bits are placeholders from the wwPDB spec. 34 # Serial number of the SEQRES record for the current chain. 35 # Starts at 1 and increments by one each line. 36 # Reset to 1 for each chain. 37 # ser_num = int(line[8:10]) 38 # Chain identifier. This may be any single legal character, 39 # including a blank which is used if there is only one chain. 40 chn_id = line[11] 41 # Number of residues in the chain (repeated on every record) 42 # num_res = int(line[13:17]) 43 residues = [seq1(res, custom_map=protein_letters_3to1) for res in line[19:].split()] 44 chains[chn_id].extend(residues) 45 elif rec_name == 'DBREF': 46 # ID code of this entry (PDB ID) 47 pdb_id = line[7:11] 48 # Chain identifier. 49 chn_id = line[12] 50 # Initial sequence number of the PDB sequence segment. 51 # seq_begin = int(line[14:18]) 52 # Initial insertion code of the PDB sequence segment. 53 # icode_begin = line[18] 54 # Ending sequence number of the PDB sequence segment. 55 # seq_end = int(line[20:24]) 56 # Ending insertion code of the PDB sequence segment. 57 # icode_end = line[24] 58 # Sequence database name. 59 database = line[26:32].strip() 60 # Sequence database accession code. 61 db_acc = line[33:41].strip() 62 # Sequence database identification code. 63 db_id_code = line[42:54].strip() 64 # Initial sequence number of the database seqment. 65 # db_seq_begin = int(line[55:60]) 66 # Insertion code of initial residue of the segment, if PDB is the 67 # reference. 68 # db_icode_begin = line[60] 69 # Ending sequence number of the database segment. 70 # db_seq_end = int(line[62:67]) 71 # Insertion code of the ending residue of the segment, if PDB is the 72 # reference. 73 # db_icode_end = line[67] 74 metadata[chn_id].append({'pdb_id': pdb_id, 'database': database, 75 'db_acc': db_acc, 'db_id_code': db_id_code}) 76 # ENH: 'SEQADV' 'MODRES' 77 78 for chn_id, residues in sorted(chains.items()): 79 record = SeqRecord(Seq(''.join(residues), generic_protein)) 80 record.annotations = {"chain": chn_id} 81 if chn_id in metadata: 82 m = metadata[chn_id][0] 83 record.id = record.name = "%s:%s" % (m['pdb_id'], chn_id) 84 record.description = ("%s:%s %s" % (m['database'], 85 m['db_acc'], 86 m['db_id_code'])) 87 for melem in metadata[chn_id]: 88 record.dbxrefs.extend([ 89 "%s:%s" % (melem['database'], melem['db_acc']), 90 "%s:%s" % (melem['database'], melem['db_id_code'])]) 91 else: 92 record.id = chn_id 93 yield record
94 95
96 -def PdbAtomIterator(handle):
97 """Returns SeqRecord objects for each chain in a PDB file 98 99 The sequences are derived from the 3D structure (ATOM records), not the 100 SEQRES lines in the PDB file header. 101 102 Unrecognised three letter amino acid codes (e.g. "CSD") from HETATM entries 103 are converted to "X" in the sequence. 104 105 In addition to information from the PDB header (which is the same for all 106 records), the following chain specific information is placed in the 107 annotation: 108 109 record.annotations["residues"] = List of residue ID strings 110 record.annotations["chain"] = Chain ID (typically A, B ,...) 111 record.annotations["model"] = Model ID (typically zero) 112 113 Where amino acids are missing from the structure, as indicated by residue 114 numbering, the sequence is filled in with 'X' characters to match the size 115 of the missing region, and None is included as the corresponding entry in 116 the list record.annotations["residues"]. 117 118 This function uses the Bio.PDB module to do most of the hard work. The 119 annotation information could be improved but this extra parsing should be 120 done in parse_pdb_header, not this module. 121 """ 122 # Only import PDB when needed, to avoid/delay NumPy dependency in SeqIO 123 from Bio.PDB import PDBParser 124 from Bio.SeqUtils import seq1 125 126 def restype(residue): 127 """Return a residue's type as a one-letter code. 128 129 Non-standard residues (e.g. CSD, ANP) are returned as 'X'. 130 """ 131 return seq1(residue.resname, custom_map=protein_letters_3to1)
132 133 # Deduce the PDB ID from the PDB header 134 # ENH: or filename? 135 from Bio.File import UndoHandle 136 undo_handle = UndoHandle(handle) 137 firstline = undo_handle.peekline() 138 if firstline.startswith("HEADER"): 139 pdb_id = firstline[62:66] 140 else: 141 warnings.warn("First line is not a 'HEADER'; can't determine PDB ID") 142 pdb_id = '????' 143 144 struct = PDBParser().get_structure(pdb_id, undo_handle) 145 model = struct[0] 146 for chn_id, chain in sorted(model.child_dict.items()): 147 # HETATM mod. res. policy: remove mod if in sequence, else discard 148 residues = [res for res in chain.get_unpacked_list() 149 if seq1(res.get_resname().upper(), 150 custom_map=protein_letters_3to1) != "X"] 151 if not residues: 152 continue 153 # Identify missing residues in the structure 154 # (fill the sequence with 'X' residues in these regions) 155 gaps = [] 156 rnumbers = [r.id[1] for r in residues] 157 for i, rnum in enumerate(rnumbers[:-1]): 158 if rnumbers[i+1] != rnum + 1: 159 # It's a gap! 160 gaps.append((i+1, rnum, rnumbers[i+1])) 161 if gaps: 162 res_out = [] 163 prev_idx = 0 164 for i, pregap, postgap in gaps: 165 if postgap > pregap: 166 gapsize = postgap - pregap - 1 167 res_out.extend(restype(x) for x in residues[prev_idx:i]) 168 prev_idx = i 169 res_out.append('X'*gapsize) 170 else: 171 warnings.warn("Ignoring out-of-order residues after a gap", 172 UserWarning) 173 # Keep the normal part, drop the out-of-order segment 174 # (presumably modified or hetatm residues, e.g. 3BEG) 175 res_out.extend(restype(x) for x in residues[prev_idx:i]) 176 break 177 else: 178 # Last segment 179 res_out.extend(restype(x) for x in residues[prev_idx:]) 180 else: 181 # No gaps 182 res_out = [restype(x) for x in residues] 183 record_id = "%s:%s" % (pdb_id, chn_id) 184 # ENH - model number in SeqRecord id if multiple models? 185 # id = "Chain%s" % str(chain.id) 186 # if len(structure) > 1 : 187 # id = ("Model%s|" % str(model.id)) + id 188 189 record = SeqRecord(Seq(''.join(res_out), generic_protein), 190 id=record_id, 191 description=record_id, 192 ) 193 194 # The PDB header was loaded as a dictionary, so let's reuse it all 195 record.annotations = struct.header.copy() 196 # Plus some chain specifics: 197 record.annotations["model"] = model.id 198 record.annotations["chain"] = chain.id 199 200 # Start & end 201 record.annotations["start"] = int(rnumbers[0]) 202 record.annotations["end"] = int(rnumbers[-1]) 203 204 # ENH - add letter annotations -- per-residue info, e.g. numbers 205 206 yield record 207 208 209 if __name__ == '__main__': 210 # Test 211 import sys 212 from Bio import SeqIO 213 for fname in sys.argv[1:]: 214 for parser in (PdbSeqresIterator, PdbAtomIterator): 215 with open(fname) as handle: 216 records = parser(handle) 217 SeqIO.write(records, sys.stdout, 'fasta') 218