Package Bio :: Package SeqIO :: Module PirIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.PirIO

  1  # Copyright 2008-2015 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This file is part of the Biopython distribution and governed by your 
  4  # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". 
  5  # Please see the LICENSE file that should have been included as part of this 
  6  # package. 
  7  """Bio.SeqIO support for the "pir" (aka PIR or NBRF) file format. 
  8   
  9  This module is for reading and writing PIR or NBRF format files as 
 10  SeqRecord objects. 
 11   
 12  You are expected to use this module via the Bio.SeqIO functions, or if 
 13  the file contains a sequence alignment, optionally via Bio.AlignIO instead. 
 14   
 15  This format was introduced for the Protein Information Resource (PIR), a 
 16  project of the National Biomedical Research Foundation (NBRF).  The PIR 
 17  database itself is now part of UniProt. 
 18   
 19  The file format is described online at: 
 20  http://www.ebi.ac.uk/help/pir_frame.html 
 21  http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html (currently down) 
 22   
 23  An example file in this format would be:: 
 24   
 25    >P1;CRAB_ANAPL 
 26    ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). 
 27      MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR 
 28      SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH 
 29      GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ 
 30      SDVPERSIPI TREEKPAIAG AQRK* 
 31   
 32    >P1;CRAB_BOVIN 
 33    ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN). 
 34      MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR 
 35      PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV 
 36      HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK 
 37      QASGPERTIP ITREEKPAVT AAPKK* 
 38   
 39  Or, an example of a multiple sequence alignment:: 
 40   
 41    >P1;S27231 
 42    rhodopsin - northern leopard frog 
 43    MNGTEGPNFY IPMSNKTGVV RSPFDYPQYY LAEPWKYSVL AAYMFLLILL GLPINFMTLY 
 44    VTIQHKKLRT PLNYILLNLG VCNHFMVLCG FTITMYTSLH GYFVFGQTGC YFEGFFATLG 
 45    GEIALWSLVV LAIERYIVVC KPMSNFRFGE NHAMMGVAFT WIMALACAVP PLFGWSRYIP 
 46    EGMQCSCGVD YYTLKPEVNN ESFVIYMFVV HFLIPLIIIS FCYGRLVCTV KEAAAQQQES 
 47    ATTQKAEKEV TRMVIIMVIF FLICWVPYAY VAFYIFTHQG SEFGPIFMTV PAFFAKSSAI 
 48    YNPVIYIMLN KQFRNCMITT LCCGKNPFGD DDASSAATSK TEATSVSTSQ VSPA* 
 49   
 50    >P1;I51200 
 51    rhodopsin - African clawed frog 
 52    MNGTEGPNFY VPMSNKTGVV RSPFDYPQYY LAEPWQYSAL AAYMFLLILL GLPINFMTLF 
 53    VTIQHKKLRT PLNYILLNLV FANHFMVLCG FTVTMYTSMH GYFIFGPTGC YIEGFFATLG 
 54    GEVALWSLVV LAVERYIVVC KPMANFRFGE NHAIMGVAFT WIMALSCAAP PLFGWSRYIP 
 55    EGMQCSCGVD YYTLKPEVNN ESFVIYMFIV HFTIPLIVIF FCYGRLLCTV KEAAAQQQES 
 56    LTTQKAEKEV TRMVVIMVVF FLICWVPYAY VAFYIFTHQG SNFGPVFMTV PAFFAKSSAI 
 57    YNPVIYIVLN KQFRNCLITT LCCGKNPFGD EDGSSAATSK TEASSVSSSQ VSPA* 
 58   
 59    >P1;JN0120 
 60    rhodopsin - Japanese lamprey 
 61    MNGTEGDNFY VPFSNKTGLA RSPYEYPQYY LAEPWKYSAL AAYMFFLILV GFPVNFLTLF 
 62    VTVQHKKLRT PLNYILLNLA MANLFMVLFG FTVTMYTSMN GYFVFGPTMC SIEGFFATLG 
 63    GEVALWSLVV LAIERYIVIC KPMGNFRFGN THAIMGVAFT WIMALACAAP PLVGWSRYIP 
 64    EGMQCSCGPD YYTLNPNFNN ESYVVYMFVV HFLVPFVIIF FCYGRLLCTV KEAAAAQQES 
 65    ASTQKAEKEV TRMVVLMVIG FLVCWVPYAS VAFYIFTHQG SDFGATFMTL PAFFAKSSAL 
 66    YNPVIYILMN KQFRNCMITT LCCGKNPLGD DE-SGASTSKT EVSSVSTSPV SPA* 
 67   
 68   
 69  As with the FASTA format, each record starts with a line beginning with ">" 
 70  character.  There is then a two letter sequence type (P1, F1, DL, DC, RL, 
 71  RC, or XX), a semi colon, and the identification code.  The second like is 
 72  free text description.  The remaining lines contain the sequence itself, 
 73  terminating in an asterisk.  Space separated blocks of ten letters as shown 
 74  above are typical. 
 75   
 76  Sequence codes and their meanings: 
 77   - P1 - Protein (complete) 
 78   - F1 - Protein (fragment) 
 79   - D1 - DNA (e.g. EMBOSS seqret output) 
 80   - DL - DNA (linear) 
 81   - DC - DNA (circular) 
 82   - RL - RNA (linear) 
 83   - RC - RNA (circular) 
 84   - N3 - tRNA 
 85   - N1 - Other functional RNA 
 86   - XX - Unknown 
 87   
 88  """ 
 89   
 90  from __future__ import print_function 
 91   
 92  from Bio.Alphabet import single_letter_alphabet, generic_protein, \ 
 93      generic_dna, generic_rna 
 94  from Bio.Seq import Seq 
 95  from Bio.SeqRecord import SeqRecord 
 96  from Bio.SeqIO.Interfaces import SequentialSequenceWriter 
 97   
 98   
 99  _pir_alphabets = {"P1": generic_protein, 
100                    "F1": generic_protein, 
101                    "D1": generic_dna, 
102                    "DL": generic_dna, 
103                    "DC": generic_dna, 
104                    "RL": generic_rna, 
105                    "RC": generic_rna, 
106                    "N3": generic_rna, 
107                    "XX": single_letter_alphabet, 
108                    } 
109   
110   
111 -def PirIterator(handle):
112 """Iterate over Fasta records as SeqRecord objects. 113 114 handle - input file 115 alphabet - optional alphabet 116 title2ids - A function that, when given the title of the FASTA 117 file (without the beginning >), will return the id, name and 118 description (in that order) for the record as a tuple of strings. 119 120 If this is not given, then the entire title line will be used 121 as the description, and the first word as the id and name. 122 123 Note that use of title2ids matches that of Bio.Fasta.SequenceParser 124 but the defaults are slightly different. 125 126 Examples 127 -------- 128 >>> with open("NBRF/DMB_prot.pir") as handle: 129 ... for record in PirIterator(handle): 130 ... print("%s length %i" % (record.id, len(record))) 131 HLA:HLA00489 length 263 132 HLA:HLA00490 length 94 133 HLA:HLA00491 length 94 134 HLA:HLA00492 length 80 135 HLA:HLA00493 length 175 136 HLA:HLA01083 length 188 137 138 """ 139 # Skip any text before the first record (e.g. blank lines, comments) 140 while True: 141 line = handle.readline() 142 if line == "": 143 return # Premature end of file, or just empty? 144 if line[0] == ">": 145 break 146 147 while True: 148 if line[0] != ">": 149 raise ValueError( 150 "Records in PIR files should start with '>' character") 151 pir_type = line[1:3] 152 if pir_type not in _pir_alphabets or line[3] != ";": 153 raise ValueError( 154 "Records should start with '>XX;' " 155 "where XX is a valid sequence type") 156 identifier = line[4:].strip() 157 description = handle.readline().strip() 158 159 lines = [] 160 line = handle.readline() 161 while True: 162 if not line: 163 break 164 if line[0] == ">": 165 break 166 # Remove trailing whitespace, and any internal spaces 167 lines.append(line.rstrip().replace(" ", "")) 168 line = handle.readline() 169 seq = "".join(lines) 170 if seq[-1] != "*": 171 # Note the * terminator is present on nucleotide sequences too, 172 # it is not a stop codon! 173 raise ValueError( 174 "Sequences in PIR files should include a * terminator!") 175 176 # Return the record and then continue... 177 record = SeqRecord(Seq(seq[:-1], _pir_alphabets[pir_type]), 178 id=identifier, name=identifier, 179 description=description) 180 record.annotations["PIR-type"] = pir_type 181 yield record 182 183 if not line: 184 return # StopIteration 185 assert False, "Should not reach this line"
186 187
188 -class PirWriter(SequentialSequenceWriter):
189 """Class to write PIR format files.""" 190
191 - def __init__(self, handle, wrap=60, record2title=None, code=None):
192 """Create a PIR writer. 193 194 Arguments: 195 - handle - Handle to an output file, e.g. as returned 196 by open(filename, "w") 197 - wrap - Optional line length used to wrap sequence lines. 198 Defaults to wrapping the sequence at 60 characters 199 Use zero (or None) for no wrapping, giving a single 200 long line for the sequence. 201 - record2title - Optional function to return the text to be 202 used for the title line of each record. By default 203 a combination of the record.id, record.name and 204 record.description is used. 205 - code - Optional sequence code must be one of P1, F1, 206 D1, DL, DC, RL, RC, N3 and XX. By default None is used, 207 which means auto detection based on record alphabet. 208 209 You can either use:: 210 211 handle = open(filename, "w") 212 writer = PirWriter(handle) 213 writer.write_file(myRecords) 214 handle.close() 215 216 Or, follow the sequential file writer system, for example:: 217 218 handle = open(filename, "w") 219 writer = PirWriter(handle) 220 writer.write_header() # does nothing for PIR files 221 ... 222 Multiple writer.write_record() and/or writer.write_records() calls 223 ... 224 writer.write_footer() # does nothing for PIR files 225 handle.close() 226 227 """ 228 SequentialSequenceWriter.__init__(self, handle) 229 self.wrap = None 230 if wrap: 231 if wrap < 1: 232 raise ValueError 233 self.wrap = wrap 234 self.record2title = record2title 235 self.code = code
236
237 - def write_record(self, record):
238 """Write a single PIR record to the file.""" 239 assert self._header_written 240 assert not self._footer_written 241 self._record_written = True 242 243 if self.record2title: 244 title = self.clean(self.record2title(record)) 245 else: 246 title = self.clean(record.id) 247 248 if record.name and record.description: 249 description = self.clean(record.name + " - " + record.description) 250 elif record.name and not record.description: 251 description = self.clean(record.name) 252 else: 253 description = self.clean(record.description) 254 255 if self.code: 256 code = self.code 257 else: 258 if isinstance(record.seq.alphabet, type(generic_protein)): 259 code = "P1" 260 elif isinstance(record.seq.alphabet, type(generic_dna)): 261 code = "D1" 262 elif isinstance(record.seq.alphabet, type(generic_rna)): 263 code = "RL" 264 else: 265 code = "XX" 266 267 assert code in _pir_alphabets, "Sequence code must be one of " + \ 268 _pir_alphabets.keys() + "." 269 assert "\n" not in title 270 assert "\r" not in description 271 272 self.handle.write(">%s;%s\n%s\n" % (code, title, description)) 273 274 data = self._get_seq_string(record) # Catches sequence being None 275 276 assert "\n" not in data 277 assert "\r" not in data 278 279 if self.wrap: 280 line = "" 281 for i in range(0, len(data), self.wrap): 282 line += data[i:i + self.wrap] + "\n" 283 line = line[:-1] + "*\n" 284 self.handle.write(line) 285 else: 286 self.handle.write(data + "*\n")
287 288 289 if __name__ == "__main__": 290 from Bio._utils import run_doctest 291 run_doctest(verbose=0) 292