Package Bio :: Package FSSP
[hide private]
[frames | no frames]

Source Code for Package Bio.FSSP

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  # 
  5   
  6  """Parser for FSSP files, used in a database of protein fold classifications. 
  7   
  8  This is a module to handle FSSP files. For now it parses only the header, 
  9  summary and alignment sections. 
 10   
 11  See: Holm and Sander (1996) The FSSP database: fold classification based on 
 12  structure-structure alignment of proteins. 
 13   
 14  functions: read_fssp(file_handle): reads an FSSP file into records. Returns a 
 15  tuple of three objects: (header, summary dictionary, alignment dictionary). 
 16  mult_align: returns a Biopython alignment object 
 17  """ 
 18  from __future__ import print_function 
 19   
 20  import re 
 21  from . import fssp_rec 
 22  from Bio.Align import Generic 
 23  from Bio import Alphabet 
 24   
 25   
 26  fff_rec = fssp_rec.fff_rec 
 27  header_records = { 
 28      'database': re.compile('^DATABASE'), 
 29      'pdbid': re.compile('^PDBID'), 
 30      'header': re.compile('^HEADER'), 
 31      'compnd': re.compile('^COMPND'), 
 32      'author': re.compile('^AUTHOR'), 
 33      'source': re.compile('^SOURCE'), 
 34      'seqlength': re.compile('^SEQLENGTH'), 
 35      'nalign': re.compile('^NALIGN') 
 36  } 
 37   
 38  summary_title = re.compile('## +SUMMARY') 
 39  summary_rec = re.compile(' *[0-9]+: +[1-9][0-9a-z]{3,3}') 
 40  alignments_title = re.compile('## +ALIGNMENTS') 
 41  alignments_rec = re.compile(' *[0-9]+ +-{0,1}[0-9]+') 
 42  equiv_title = re.compile('## +EQUIVALENCES') 
 43   
 44   
class FSSPHeader(object):
    """Values collected from the header records of an FSSP file.

    Every attribute starts at an empty default and is overwritten by
    fill_header() when a line carrying the matching keyword is seen.
    """

    def __init__(self):
        """Create a header with all fields at their defaults."""
        self.database = None
        self.pdbid = ''
        self.header = ''
        self.compnd = ''
        self.source = ''
        self.author = []
        self.seqlength = 0
        self.nalign = 0

    def fill_header(self, inline):
        """Store the value carried by one header line.

        The line is tested against every pattern in ``header_records``;
        lines that match none of them are ignored.  How the value is
        extracted depends on the record:

        - database, seqlength, nalign: second whitespace-separated token,
          converted to int.
        - compnd, author: list of all tokens after the keyword.
        - source, header: text after the first space, stripped.
        - anything else (pdbid): second whitespace-separated token.
        """
        for field, pattern in header_records.items():
            if not pattern.match(inline):
                continue
            if field in ('database', 'seqlength', 'nalign'):
                value = int(inline.split()[1])
            elif field in ('compnd', 'author'):
                value = inline.split()[1:]
            elif field in ('source', 'header'):
                value = inline[inline.find(' ') + 1:].strip()
            else:
                value = inline.split()[1]
            setattr(self, field, value)
68 69
class PosAlign(object):
    """One cell of an FSSP alignment row.

    Attributes:
     - aa: residue letter, or '-' for a gap.
     - gap: 1 when the cell is the gap marker '..', otherwise 0.
     - ss: upper-cased second character of the cell, or '0' when the cell
       has only one character.  Only set for non-gap cells.
    """

    def __init__(self, inStr):
        """Parse a one- or two-character alignment cell.

        Surrounding whitespace is ignored.

        Raises:
            ValueError: if the stripped text is not 1 or 2 characters long.
        """
        inStr = inStr.strip()
        if len(inStr) not in (1, 2):
            # The message names the accepted lengths and quotes the input.
            raise ValueError(
                'PosAlign: expected 1 or 2 characters, got %r' % inStr)
        if inStr == '..':
            self.aa = '-'
            self.gap = 1
        else:
            self.gap = 0
            self.aa = inStr[0]
            # Any residue character that equals its own lower-case form is
            # recorded as 'C'.
            # NOTE(review): presumably the convention of lower-case letters
            # marking bridged cysteines; this also catches non-letters.
            if self.aa == self.aa.lower():
                self.aa = 'C'
            if len(inStr) == 2:
                self.ss = inStr[1].upper()
            else:
                self.ss = '0'

    def __repr__(self):
        """Return '..' for a gap, else residue plus lower-cased ss code."""
        if self.gap:
            return '..'
        return self.aa + self.ss.lower()

    __str__ = __repr__
96 97
class FSSPSumRec(object):
    """Fields of a single FSSP summary record.

    The original line is kept in ``raw`` and is what repr()/str() return.
    """

    @staticmethod
    def _split_pdb_chain(token, error_text):
        """Split an ID token into (4-character PDB code, chain).

        A 4-character token has no chain and gets '0'; a 5-character token
        carries the chain as its last character.  Any other length raises
        ValueError with ``error_text``.
        """
        if len(token) == 4:
            return token[:4], '0'
        if len(token) == 5:
            return token[:4], token[4]
        raise ValueError(error_text)

    def __init__(self, in_str):
        """Parse a whitespace-separated summary line into attributes."""
        self.raw = in_str
        fields = in_str.strip().split()
        # The record number is the first token minus its last character.
        self.nr = int(fields[0][:-1])
        self.pdb1, self.chain1 = self._split_pdb_chain(fields[1],
                                                       'Bad PDB ID 1')
        self.pdb2, self.chain2 = self._split_pdb_chain(fields[2],
                                                       'Bad PDB ID 2')
        self.zscore = float(fields[3])
        self.rmsd = float(fields[4])
        self.lali = float(fields[5])
        self.lseq2 = float(fields[6])
        self.pID = float(fields[7])
        self.revers = int(fields[8])
        self.permut = int(fields[9])
        self.nfrag = int(fields[10])
        self.topo = fields[11]
        # Remaining tokens form the description, single-spaced, plus newline.
        self.doc = ' '.join(fields[12:]) + '\n'

    def __repr__(self):
        """Return the unmodified input line."""
        return self.raw

    __str__ = __repr__
136 137
class FSSPAlignRec(object):
    """One residue line of the FSSP alignments section.

    The fixed fields are read from ``in_fff_rec`` using the positions
    defined in ``fssp_rec.align``.  The aligned cells are collected with
    add_align_list() and later converted to ``pos_align_dict``.
    """

    def __init__(self, in_fff_rec):
        """Read the fixed residue fields from an indexable record."""
        layout = fssp_rec.align
        self.abs_res_num = int(in_fff_rec[layout.abs_res_num])
        self.pdb_res_num = in_fff_rec[layout.pdb_res_num].strip()
        # A blank chain field is stored as '0'.
        chain = in_fff_rec[layout.chain_id]
        self.chain_id = '0' if chain == ' ' else chain
        # A residue name that equals its lower-case form is stored as 'C'.
        residue = in_fff_rec[layout.res_name]
        self.res_name = 'C' if residue == residue.lower() else residue
        self.ss1 = in_fff_rec[layout.ss1]
        self.turn3 = in_fff_rec[layout.turn3]
        self.turn4 = in_fff_rec[layout.turn4]
        self.turn5 = in_fff_rec[layout.turn5]
        self.pos_align_dict = {}
        self.PosAlignList = []

    def add_align_list(self, align_list):
        """Append a PosAlign for every cell string in ``align_list``."""
        for cell in align_list:
            self.PosAlignList.append(PosAlign(cell))

    def pos_align_list2dict(self):
        """Copy PosAlignList into pos_align_dict, numbering from 1."""
        for position, cell in enumerate(self.PosAlignList, 1):
            self.pos_align_dict[position] = cell
165 166
class FSSPAlignDict(dict):
    """Dictionary of alignment records with residue-number lookup tables.

    Attributes:
     - pdb_res_dict: maps a record's ``pdb_res_num`` to its key in self.
     - abs_res_dict: maps a record's ``abs_res_num`` to its key in self.
     - data: empty dict; not used by any method of this class.

    Both lookup tables are filled by build_resnum_list(), which must be
    called before abs(), pdb(), sequence() or fasta_mult_align().
    """

    def __init__(self):
        """Create an empty dictionary with empty lookup tables."""
        super(FSSPAlignDict, self).__init__()
        self.pdb_res_dict = {}
        self.abs_res_dict = {}
        self.data = {}

    def build_resnum_list(self):
        """Fill both lookup tables from the records currently stored."""
        for key, record in self.items():
            self.abs_res_dict[record.abs_res_num] = key
            self.pdb_res_dict[record.pdb_res_num] = key

    def abs(self, num):
        """Return the record with absolute residue number ``num``.

        Raises KeyError if the number is not in ``abs_res_dict``.
        """
        return self[self.abs_res_dict[num]]

    def pdb(self, num):
        """Return the record with PDB residue number ``num``.

        Raises KeyError if the number is not in ``pdb_res_dict``.
        """
        return self[self.pdb_res_dict[num]]

    def sequence(self, num):
        """Return the residues of aligned entry ``num`` as a string.

        Residues are taken in ascending absolute residue number.
        """
        return ''.join(self.abs(res_num).pos_align_dict[num].aa
                       for res_num in sorted(self.abs_res_dict))

    def fasta_mult_align(self):
        """Return all aligned entries as FASTA-style text.

        Each entry gets a '> <number>' title line followed by its residues
        in ascending absolute residue number, the same order sequence()
        uses.  Walking the records in dictionary order instead would
        scramble the sequences whenever that order differs from residue
        order.

        A newline is inserted before every 72nd character, so the first
        line of a sequence holds 71 characters and later lines hold 72.

        Raises KeyError if absolute residue 1 is not in ``abs_res_dict``.
        """
        # The entry numbers are taken from the record of residue 1.
        columns = dict((entry, []) for entry in self.abs(1).pos_align_dict)
        for res_num in sorted(self.abs_res_dict):
            aligned = self.abs(res_num).pos_align_dict
            for entry in aligned:
                columns[entry].append(aligned[entry].aa)

        pieces = []
        for entry in sorted(columns):
            pieces.append('> %d\n' % entry)
            residues = ''.join(columns[entry])
            for count, char in enumerate(residues, 1):
                if count % 72 == 0:
                    pieces.append('\n')
                pieces.append(char)
            pieces.append('\n')
        return ''.join(pieces)
216 217
class FSSPSumDict(dict):
    """Plain dictionary subclass used to hold FSSP summary records."""
def read_fssp(fssp_handle):
    """Process an FSSP file into its constituents.

    Arguments:
     - fssp_handle: object with a ``readline()`` method that returns an
       empty string at end of file.

    Returns a 3-tuple ``(header, sum_dict, align_dict)``:
     - header: FSSPHeader filled from the lines before the SUMMARY title.
     - sum_dict: FSSPSumDict of FSSPSumRec objects keyed by their ``nr``.
     - align_dict: FSSPAlignDict of FSSPAlignRec objects keyed by
       chain id + residue name + PDB residue number.  Each record's
       PosAlignList has been converted to ``pos_align_dict`` and deleted,
       and the residue-number lookup tables have been built.

    Raises:
        ValueError: if the file ends before a SUMMARY title, or before an
            ALIGNMENTS or EQUIVALENCES title, is found.
        EOFError: if the file ends directly after an alignment record.
    """
    header = FSSPHeader()
    sum_dict = FSSPSumDict()
    align_dict = FSSPAlignDict()

    # Header section: every line before the SUMMARY title.
    curline = fssp_handle.readline()
    while not summary_title.match(curline):
        if not curline:
            # readline() keeps returning '' at EOF, so without this check a
            # file with no SUMMARY title would never leave the loop.
            raise ValueError('Bad FSSP file: no summary record found')
        header.fill_header(curline)
        curline = fssp_handle.readline()

    curline = fssp_handle.readline()  # Read the title line, discard
    curline = fssp_handle.readline()  # Read the next line
    # Process the summary records into a dictionary keyed by record number
    while summary_rec.match(curline):
        cur_sum_rec = FSSPSumRec(curline)
        sum_dict[cur_sum_rec.nr] = cur_sum_rec
        curline = fssp_handle.readline()

    # Outer loop: process everything up to the EQUIVALENCES title record
    while not equiv_title.match(curline):
        # Skip ahead to the next ALIGNMENTS or EQUIVALENCES title.
        while not (alignments_title.match(curline) or
                   equiv_title.match(curline)):
            curline = fssp_handle.readline()
            if not curline:
                # End of file before either title: stop instead of
                # re-reading '' forever.
                raise ValueError(
                    'Bad FSSP file: no alignments title record found')
        if equiv_title.match(curline):
            break

        # We are on an alignments title. Parse the alignment records.
        curline = fssp_handle.readline()  # Read the title line, discard
        curline = fssp_handle.readline()  # Read the next line
        while alignments_rec.match(curline):
            align_rec = FSSPAlignRec(fff_rec(curline))
            key = (align_rec.chain_id + align_rec.res_name +
                   str(align_rec.pdb_res_num))
            align_list = curline[fssp_rec.align.start_aa_list:].strip().split()
            # The same residue appears once per alignments section; only
            # the first occurrence creates the record, later ones extend it.
            if key not in align_dict:
                align_dict[key] = align_rec
            align_dict[key].add_align_list(align_list)
            curline = fssp_handle.readline()
            if not curline:
                raise EOFError(
                    'Bad FSSP file: unexpected end of file in alignments '
                    'section')

    for record in align_dict.values():
        record.pos_align_list2dict()
        del record.PosAlignList
    align_dict.build_resnum_list()
    return (header, sum_dict, align_dict)
280