Package Bio :: Package SeqIO :: Module AbiIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.AbiIO

  1  # Copyright 2011 by Wibowo Arindrarto (w.arindrarto@gmail.com) 
  2  # Revisions copyright 2011, 2014 by Peter Cock. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license. Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Bio.SeqIO parser for the ABI format. 
  8   
  9  ABI is the format used by Applied Biosystem's sequencing machines to store 
 10  sequencing results. 
 11   
 12  For more details on the format specification, visit: 
 13  http://www.appliedbiosystem.com/support/software_community/ABIF_File_Format.pdf 
 14   
 15  """ 
 16   
 17  __docformat__ = "restructuredtext en" 
 18   
 19  import datetime 
 20  import struct 
 21   
 22  from os.path import basename 
 23   
 24  from Bio import Alphabet 
 25  from Bio.Alphabet.IUPAC import ambiguous_dna, unambiguous_dna 
 26  from Bio.Seq import Seq 
 27  from Bio.SeqRecord import SeqRecord 
 28   
 29  from Bio._py3k import _bytes_to_string, _as_bytes 
 30  from Bio._py3k import zip 
 31   
 32  # dictionary for determining which tags goes into SeqRecord annotation 
 33  # each key is tag_name + tag_number 
 34  # if a tag entry needs to be added, just add its key and its key 
 35  # for the annotations dictionary as the value 
 36  # dictionary for tags that require preprocessing before use in creating 
 37  # seqrecords 
 38  _EXTRACT = { 
 39      'TUBE1': 'sample_well', 
 40      'DySN1': 'dye', 
 41      'GTyp1': 'polymer', 
 42      'MODL1': 'machine_model', 
 43  } 
 44   
 45   
 46  # Complete data structure representing 98% of the API. The general section 
 47  # represents the part of the API that's common to ALL instruments, whereas the 
 48  # instrument specific sections are labelled as they are in the ABIF spec 
 49  # 
 50  # Keys don't seem to clash from machine to machine, so when we parse, we look 
 51  # for ANY key, and store that in the raw ABIF data structure attached to the 
 52  # annotations, with the assumption that anyone parsing the data can look up the 
 53  # spec themself 
 54  # 
 55  # Key definitions are retained in case end users want "nice" labels pre-made 
 56  # for them for all of the available fields. 
 57  _INSTRUMENT_SPECIFIC_TAGS = {} 
 58   
 59  _INSTRUMENT_SPECIFIC_TAGS['general'] = { 
 60      'APFN2': 'Sequencing Analysis parameters file name', 
 61      'APXV1': 'Analysis Protocol XML schema version', 
 62      'APrN1': 'Analysis Protocol settings name', 
 63      'APrV1': 'Analysis Protocol settings version', 
 64      'APrX1': 'Analysis Protocol XML string', 
 65      'CMNT1': 'Sample Comment', 
 66      'CTID1': 'Container Identifier, a.k.a. plate barcode', 
 67      'CTNM1': 'Container name, usually identical to CTID, but not necessarily so', 
 68      'CTTL1': 'Comment Title', 
 69      'CpEP1': 'Capillary type electrophoresis. 1 for a capillary based machine. 0 for a slab gel based machine.', 
 70      'DATA1': 'Channel 1 raw data', 
 71      'DATA2': 'Channel 2 raw data', 
 72      'DATA3': 'Channel 3 raw data', 
 73      'DATA4': 'Channel 4 raw data', 
 74      'DATA5': 'Short Array holding measured volts/10 (EP voltage) during run', 
 75      'DATA6': 'Short Array holding measured milliAmps trace (EP current) during run', 
 76      'DATA7': 'Short Array holding measured milliWatts trace (Laser EP Power) during run', 
 77      'DATA8': 'Short Array holding measured oven Temperature (polymer temperature) trace during run', 
 78      'DATA9': 'Channel 9 processed data', 
 79      'DATA10': 'Channel 10 processed data', 
 80      'DATA11': 'Channel 11 processed data', 
 81      'DATA12': 'Channel 12 processed data', 
 82      # Prism 3100/3100-Avant may provide DATA105 
 83      #          3130/3130-XL may provide DATA105 
 84      # 3530/3530-XL may provide DATA105-199, 9-12, 205-299 
 85      'DSam1': 'Downsampling factor', 
 86      'DySN1': 'Dye set name', 
 87      'Dye#1': 'Number of dyes', 
 88      'DyeN1': 'Dye 1 name', 
 89      'DyeN2': 'Dye 2 name', 
 90      'DyeN3': 'Dye 3 name', 
 91      'DyeN4': 'Dye 4 name', 
 92      'DyeW1': 'Dye 1 wavelength', 
 93      'DyeW2': 'Dye 2 wavelength', 
 94      'DyeW3': 'Dye 3 wavelength', 
 95      'DyeW4': 'Dye 4 wavelength', 
 96      # 'DyeN5-N': 'Dye 5-N Name', 
 97      # 'DyeW5-N': 'Dye 5-N Wavelength', 
 98      'EPVt1': 'Electrophoresis voltage setting (volts)', 
 99      'EVNT1': 'Start Run event', 
100      'EVNT2': 'Stop Run event', 
101      'EVNT3': 'Start Collection event', 
102      'EVNT4': 'Stop Collection event', 
103      'FWO_1': 'Base Order. Sequencing Analysis Filter wheel order. Fixed for 3500 at "GATC"', 
104      'GTyp1': 'Gel or polymer Type', 
105      'InSc1': 'Injection time (seconds)', 
106      'InVt1': 'Injection voltage (volts)', 
107      'LANE1': 'Lane/Capillary', 
108      'LIMS1': 'Sample tracking ID', 
109      'LNTD1': 'Length to detector', 
110      'LsrP1': 'Laser Power setting (micro Watts)', 
111      'MCHN1': 'Instrument name and serial number', 
112      'MODF1': 'Data collection module file', 
113      'MODL1': 'Model number', 
114      'NAVG1': 'Pixels averaged per lane', 
115      'NLNE1': 'Number of capillaries', 
116      'OfSc1': 'List of scans that are marked off scale in Collection. (optional)', 
117      # OvrI and OrvV are listed as "1-N", and "One for each dye (unanalyzed 
118      # and/or analyzed data)" 
119      'OvrI1': 'List of scan number indexes that have values greater than 32767 but did not ' 
120               'saturate the camera. In Genemapper samples, this can have indexes with ' 
121               'values greater than 32000. In sequencing samples, this cannot have ' 
122               'indexes with values greater than 32000.', 
123      'OvrI2': 'List of scan number indexes that have values greater than 32767 but did not ' 
124               'saturate the camera. In Genemapper samples, this can have indexes with ' 
125               'values greater than 32000. In sequencing samples, this cannot have ' 
126               'indexes with values greater than 32000.', 
127      'OvrI3': 'List of scan number indexes that have values greater than 32767 but did not ' 
128               'saturate the camera. In Genemapper samples, this can have indexes with ' 
129               'values greater than 32000. In sequencing samples, this cannot have ' 
130               'indexes with values greater than 32000.', 
131      'OvrI4': 'List of scan number indexes that have values greater than 32767 but did not ' 
132               'saturate the camera. In Genemapper samples, this can have indexes with ' 
133               'values greater than 32000. In sequencing samples, this cannot have ' 
134               'indexes with values greater than 32000.', 
135      'OvrV1': 'List of color data values found at the locations listed in the OvrI tag. ' 
136               'There must be exactly as many numbers in this array as in the OvrI array.', 
137      'OvrV2': 'List of color data values found at the locations listed in the OvrI tag. ' 
138               'There must be exactly as many numbers in this array as in the OvrI array.', 
139      'OvrV3': 'List of color data values found at the locations listed in the OvrI tag. ' 
140               'There must be exactly as many numbers in this array as in the OvrI array.', 
141      'OvrV4': 'List of color data values found at the locations listed in the OvrI tag. ' 
142               'There must be exactly as many numbers in this array as in the OvrI array.', 
143      'PDMF1': 'Sequencing Analysis Mobility file name chosen in collection', 
144      'RMXV1': 'Run Module XML schema version', 
145      'RMdN1': 'Run Module name (same as MODF)', 
146      'RMdX1': 'Run Module XML string', 
147      'RPrN1': 'Run Protocol name', 
148      'RPrV1': 'Run Protocol version', 
149      'RUND1': 'Run Started Date', 
150      'RUND2': 'Run Stopped Date', 
151      'RUND3': 'Data Collection Started Date', 
152      'RUND4': 'Data Collection Stopped date', 
153      'RUNT1': 'Run Started Time', 
154      'RUNT2': 'Run Stopped Time', 
155      'RUNT3': 'Data Collection Started Time', 
156      'RUNT4': 'Data Collection Stopped Time', 
157      'Rate1': 'Scanning Rate. Milliseconds per frame.', 
158      'RunN1': 'Run Name', 
159      'SCAN1': 'Number of scans', 
160      'SMED1': 'Polymer lot expiration date', 
161      'SMLt1': 'Polymer lot number', 
162      'SMPL1': 'Sample name', 
163      'SVER1': 'Data collection software version', 
164      'SVER3': 'Data collection firmware version', 
165      'Satd1': 'Array of longs representing the scan numbers of data points, which are flagged as saturated by data collection (optional)', 
166      'Scal1': 'Rescaling divisor for color data', 
167      'Scan1': 'Number of scans (legacy - use SCAN)', 
168      'TUBE1': 'Well ID', 
169      'Tmpr1': 'Run temperature setting', 
170      'User1': 'Name of user who created the plate (optional)', 
171  } 
172   
173  # No instrument specific tags 
174  #_INSTRUMENT_SPECIFIC_TAGS['abi_prism_3100/3100-Avant'] = { 
175  #} 
176   
177  _INSTRUMENT_SPECIFIC_TAGS['abi_3130/3130xl'] = { 
178      'CTOw1': 'Container owner', 
179      'HCFG1': 'Instrument Class', 
180      'HCFG2': 'Instrument Family', 
181      'HCFG3': 'Official Instrument Name', 
182      'HCFG4': 'Instrument Parameters', 
183      'RMdVa1': 'Run Module version', 
184  } 
185   
186  _INSTRUMENT_SPECIFIC_TAGS['abi_3530/3530xl'] = { 
187      'AAct1': 'Primary Analysis Audit Active indication. True if system auditing was enabled during the last write of this file, ' 
188               'false if system auditing was disabled.', 
189      'ABED1': 'Anode buffer expiration date using ISO 8601 format using the patterns YYYY-MM-DDTHH:MM:SS.ss+/-HH:MM. Hundredths of a second are optional.', 
190      'ABID1': 'Anode buffer tray first installed date', 
191      'ABLt1': 'Anode buffer lot number', 
192      'ABRn1': 'Number of runs (injections) processed with the current Anode Buffer (runs allowed - runs remaining)', 
193      'ABTp1': 'Anode buffer type', 
194      'AEPt1': 'Analysis Ending scan number for basecalling on initial analysis', 
195      'AEPt2': 'Analysis Ending scan number for basecalling on last analysis', 
196      'APCN1': 'Amplicon name', 
197      'ARTN1': 'Analysis Return code. Produced only by 5 Prime basecaller 1.0b3', 
198      'ASPF1': 'Flag to indicate whether adaptive processing worked or not', 
199      'ASPt1': 'Analysis Starting scan number for first analysis', 
200      'ASPt2': 'Analysis Starting scan number for last analysis', 
201      'AUDT2': 'Audit log used across 3500 software (optional)', 
202      'AVld1': 'Assay validation flag (true or false)', 
203      'AmbT1': 'Record of ambient temperature readings', 
204      'AsyC1': 'The assay contents (xml format)', 
205      'AsyN1': 'The assay name', 
206      'AsyV1': 'The assay version', 
207      'B1Pt1': 'Reference scan number for mobility and spacing curves for first analysis', 
208      'B1Pt2': 'Reference scan number for mobility and spacing curves for last analysis', 
209      'BCTS1': 'Basecaller timestamp. Time of completion of most recent analysis', 
210      'BcRn1': 'Basecalling qc code', 
211      'BcRs1': 'Basecalling warnings, a concatenated comma separated string', 
212      'BcRs2': 'Basecalling errors, a concatenated comma separated string', 
213      'CAED1': 'Capillary array expiration', 
214      'CALt1': 'Capillary array lot number', 
215      'CARn1': 'Number of injections processed (including the one of which this sample was a part) through the capillary array', 
216      'CASN1': 'Capillary array serial number', 
217      'CBED1': 'Cathode buffer expiration date', 
218      'CBID1': 'Cathode buffer tray first installed date', 
219      'CBLt1': 'Cathode buffer lot number', 
220      'CBRn1': 'Number of runs (injections) processed with the current Cathode Buffer (runs allowed - runs remaining)', 
221      'CBTp1': 'Cathode buffer type', 
222      'CLRG1': 'Start of the clear range (inclusive).', 
223      'CLRG2': 'Clear range length', 
224      'CRLn1': 'Contiguous read length', 
225      'CRLn2': 'One of "Pass", "Fail", or "Check"', 
226      'CTOw1': 'The name entered as the Owner of a plate, in the plate editor', 
227      'CkSm1': 'File checksum', 
228      'DCEv1': 'A list of door-close events, separated by semicolon. Door open events are generally paired with door close events.', 
229      'DCHT1': 'Reserved for backward compatibility. The detection cell heater temperature setting from the Run Module. Not used for 3500.', 
230      'DOEv1': 'A list of door-open events, separated by semicolon. Door close events are generally paired with door open events.', 
231      'ESig2': 'Electronic signature record used across 3500 software', 
232      'FTab1': 'Feature table. Can be created by Nibbler for Clear Range.', 
233      'FVoc1': 'Feature table vocabulary. Can be created by Nibbler for Clear Range.', 
234      'Feat1': 'Features. Can be created by Nibbler for Clear Range.', 
235      'HCFG1': 'The Instrument Class. All upper case, no spaces. Initial valid value: CE', 
236      'HCFG2': 'The Instrument Family. All upper case, no spaces. Valid values: 31XX or 37XX for UDC, 35XX (for 3500)', 
237      'HCFG3': 'The official instrument name. Mixed case, minus any special formatting. Initial valid values: 3130, 3130xl, 3730, 3730xl, 3500, 3500xl.', 
238      'HCFG4': 'Instrument parameters. Contains key-value pairs of instrument configuration information, separated by semicolons. ' 
239               'Four parameters are included initially: UnitID=<UNITD number>, CPUBoard=<board type>, ' 
240               'ArraySize=<# of capillaries>, SerialNumber=<Instrument Serial#>.', 
241      'InjN1': 'Injection name', 
242      'LAST1': 'Parameter settings information', 
243      'NOIS1': 'The estimate of rms baseline noise (S/N ratio) for each dye for a successfully analyzed sample. ' 
244               'Corresponds in order to the raw data in tags DATA 1-4. KB basecaller only.', 
245      'P1AM1': 'Amplitude of primary peak, which is not necessarily equal to corresponding signal strength at that position', 
246      'P1RL1': 'Deviation of primary peak position from (PLoc,2), times 100, rounded to integer', 
247      'P1WD1': 'Full-width Half-max of primary peak, times 100, rounded to integer. ' 
248               'Corresponding signal intensity is not necessarily equal to one half of primary peak amplitude', 
249      'P2AM1': 'Amplitude of secondary peak, which is not necessarily equal to corresponding signal strength at that position', 
250      'P2BA1': 'Base of secondary peak', 
251      'P2RL1': 'Deviation of secondary peak position from (PLoc,2), times 100, rounded to integer', 
252      'PBAS1': 'Array of sequence characters edited by user', 
253      'PBAS2': 'Array of sequence characters as called by Basecaller', 
254      'PCON1': 'Array of quality Values (0-255) as edited by user', 
255      'PCON2': 'Array of quality values (0-255) as called by Basecaller', 
256      'PDMF2': 'Mobility file name chosen in most recent analysis (identical to PDMF1)', 
257      'PLOC1': 'Array of peak locations edited by user', 
258      'PLOC2': 'Array of peak locations as called by Basecaller', 
259      'PRJT1': 'SeqScape 2.0 project template name', 
260      'PROJ4': 'SeqScape 2.0 project name', 
261      'PSZE1': 'Plate size. The number of sample positions in the container. Current allowed values: 96, 384.', 
262      'PTYP1': 'Plate type. Current allowed values: 96-Well, 384-Well.', 
263      'PuSc1': 'Median pupscore', 
264      'QV201': 'QV20+ value', 
265      'QV202': 'One of "Pass", "Fail", or "Check"', 
266      'QcPa1': 'QC parameters', 
267      'QcRn1': 'Trimming and QC code', 
268      'QcRs1': 'QC warnings, a concatenated comma separated string', 
269      'QcRs2': 'QC errors, a concatenated comma separated string', 
270      'RGOw1': 'The name entered as the Owner of a Results Group, in the Results Group Editor. Implemented as the user name from the results group.', 
271      'RInj1': 'Reinjection number. The reinjection number that this sample belongs to. Not present if there was no reinjection.', 
272      'RNmF1': 'Raman normalization factor', 
273      'RevC1': 'for whether the sequence has been complemented', 
274      'RunN1': 'Run name (which, for 3500, is different from injection name)', 
275      'S/N%1': 'Signal strength for each dye', 
276      'SMID1': 'Polymer first installed date', 
277      'SMRn1': 'Number of runs (injections) processed with the current polymer (runs allowed - runs remaining)', 
278      'SPAC1': 'Average peak spacing used in last analysis', 
279      'SPAC2': 'Basecaller name - corresponds to name of bcp file.', 
280      'SPAC3': 'Average peak spacing last calculated by the Basecaller.', 
281      'SPEC1': 'Sequencing Analysis Specimen Name', 
282      'SVER2': 'Basecaller version number', 
283      'SVER4': 'Sample File Format Version String', 
284      'ScPa1': 'The parameter string of size caller', 
285      'ScSt1': 'Raw data start point. Set to 0 for 3500 data collection.', 
286      'SpeN1': 'Active spectral calibration name', 
287      'TrPa1': 'Timming parameters', 
288      'TrSc1': 'Trace score.', 
289      'TrSc2': 'One of "Pass", "Fail", or "Check"', 
290      'phAR1': 'Trace peak aria ratio', 
291      'phCH1': 'Chemistry type ("term", "prim", "unknown"), based on DYE_1 information', 
292      'phDY1': 'Dye ("big", "d-rhod", "unknown"), based on mob file information', 
293      'phQL1': 'Maximum Quality Value', 
294      'phTR1': 'Set Trim region', 
295      'phTR2': 'Trim probability', 
296  } 
297   
298  _INSTRUMENT_SPECIFIC_TAGS['abi_3730/3730xl'] = { 
299      'BufT1': 'Buffer tray heater temperature (degrees C)', 
300  } 
301   
302  # dictionary for data unpacking format 
303  _BYTEFMT = { 
304      1: 'b',     # byte 
305      2: 's',     # char 
306      3: 'H',     # word 
307      4: 'h',     # short 
308      5: 'i',     # long 
309      6: '2i',    # rational, legacy unsupported 
310      7: 'f',     # float 
311      8: 'd',     # double 
312      10: 'h2B',  # date 
313      11: '4B',   # time 
314      12: '2i2b',  # thumb 
315      13: 'B',    # bool 
316      14: '2h',   # point, legacy unsupported 
317      15: '4h',   # rect, legacy unsupported 
318      16: '2i',   # vPoint, legacy unsupported 
319      17: '4i',   # vRect, legacy unsupported 
320      18: 's',    # pString 
321      19: 's',    # cString 
322      20: '2i',   # tag, legacy unsupported 
323  } 
324  # header data structure (exluding 4 byte ABIF marker) 
325  _HEADFMT = '>H4sI2H3I' 
326  # directory data structure 
327  _DIRFMT = '>4sI2H4I' 
328   
329  __global_tag_listing = [] 
330  for tag in _INSTRUMENT_SPECIFIC_TAGS.values(): 
331      __global_tag_listing += tag.keys() 
332   
333   
334 -def AbiIterator(handle, alphabet=None, trim=False):
335 """Iterator for the Abi file format. 336 """ 337 # raise exception is alphabet is not dna 338 if alphabet is not None: 339 if isinstance(Alphabet._get_base_alphabet(alphabet), 340 Alphabet.ProteinAlphabet): 341 raise ValueError( 342 "Invalid alphabet, ABI files do not hold proteins.") 343 if isinstance(Alphabet._get_base_alphabet(alphabet), 344 Alphabet.RNAAlphabet): 345 raise ValueError("Invalid alphabet, ABI files do not hold RNA.") 346 347 # raise exception if handle mode is not 'rb' 348 if hasattr(handle, 'mode'): 349 if set('rb') != set(handle.mode.lower()): 350 raise ValueError("ABI files has to be opened in 'rb' mode.") 351 352 # check if input file is a valid Abi file 353 handle.seek(0) 354 marker = handle.read(4) 355 if not marker: 356 # handle empty file gracefully 357 raise StopIteration 358 if marker != _as_bytes('ABIF'): 359 raise IOError('File should start ABIF, not %r' % marker) 360 361 # dirty hack for handling time information 362 times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': '', } 363 364 # initialize annotations 365 annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT))) 366 367 # parse header and extract data from directories 368 header = struct.unpack(_HEADFMT, 369 handle.read(struct.calcsize(_HEADFMT))) 370 371 raw = dict() 372 for tag_name, tag_number, tag_data in _abi_parse_header(header, handle): 373 key = tag_name + str(tag_number) 374 375 raw[key] = tag_data 376 377 # PBAS2 is base-called sequence, only available in 3530 378 if key == 'PBAS2': 379 seq = tag_data 380 ambigs = 'KYWMRS' 381 if alphabet is None: 382 if set(seq).intersection(ambigs): 383 alphabet = ambiguous_dna 384 else: 385 alphabet = unambiguous_dna 386 # PCON2 is quality values of base-called sequence 387 elif key == 'PCON2': 388 qual = [ord(val) for val in tag_data] 389 # SMPL1 is sample id entered before sequencing run 390 elif key == 'SMPL1': 391 sample_id = tag_data 392 elif key in times: 393 times[key] = tag_data 394 else: 395 if key in _EXTRACT: 396 annot[_EXTRACT[key]] = tag_data 397 398 # set time annotations 399 annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1']) 400 annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2']) 401 402 # raw data (for advanced end users benefit) 403 annot['abif_raw'] = raw 404 405 # use the file name as SeqRecord.name if available 406 try: 407 file_name = basename(handle.name).replace('.ab1', '') 408 except: 409 file_name = "" 410 record = SeqRecord(Seq(seq, alphabet), 411 id=sample_id, name=file_name, 412 description='', 413 annotations=annot, 414 letter_annotations={'phred_quality': qual}) 415 416 if not trim: 417 yield record 418 else: 419 yield _abi_trim(record)
420 421
422 -def _AbiTrimIterator(handle):
423 """Iterator for the Abi file format that yields trimmed SeqRecord objects. 424 """ 425 return AbiIterator(handle, trim=True)
426 427
428 -def _abi_parse_header(header, handle):
429 """Generator that returns directory contents. 430 """ 431 # header structure (after ABIF marker): 432 # file version, tag name, tag number, 433 # element type code, element size, number of elements 434 # data size, data offset, handle (not file handle) 435 head_elem_size = header[4] 436 head_elem_num = header[5] 437 head_offset = header[7] 438 index = 0 439 440 while index < head_elem_num: 441 start = head_offset + index * head_elem_size 442 # add directory offset to tuple 443 # to handle directories with data size <= 4 bytes 444 handle.seek(start) 445 dir_entry = struct.unpack(_DIRFMT, 446 handle.read(struct.calcsize(_DIRFMT))) + (start,) 447 index += 1 448 # only parse desired dirs 449 key = _bytes_to_string(dir_entry[0]) 450 key += str(dir_entry[1]) 451 452 tag_name = _bytes_to_string(dir_entry[0]) 453 tag_number = dir_entry[1] 454 elem_code = dir_entry[2] 455 elem_num = dir_entry[4] 456 data_size = dir_entry[5] 457 data_offset = dir_entry[6] 458 tag_offset = dir_entry[8] 459 # if data size <= 4 bytes, data is stored inside tag 460 # so offset needs to be changed 461 if data_size <= 4: 462 data_offset = tag_offset + 20 463 handle.seek(data_offset) 464 data = handle.read(data_size) 465 yield tag_name, tag_number, \ 466 _parse_tag_data(elem_code, elem_num, data)
467 468
469 -def _abi_trim(seq_record):
470 """Trims the sequence using Richard Mott's modified trimming algorithm. 471 472 Arguments: 473 - seq_record - SeqRecord object to be trimmed. 474 475 Trimmed bases are determined from their segment score, which is a 476 cumulative sum of each base's score. Base scores are calculated from 477 their quality values. 478 479 More about the trimming algorithm: 480 http://www.phrap.org/phredphrap/phred.html 481 http://www.clcbio.com/manual/genomics/Quality_abif_trimming.html 482 """ 483 484 start = False # flag for starting position of trimmed sequence 485 segment = 20 # minimum sequence length 486 trim_start = 0 # init start index 487 cutoff = 0.05 # default cutoff value for calculating base score 488 489 if len(seq_record) <= segment: 490 return seq_record 491 else: 492 # calculate base score 493 score_list = [cutoff - (10 ** (qual / -10.0)) for qual in 494 seq_record.letter_annotations['phred_quality']] 495 496 # calculate cummulative score 497 # if cummulative value < 0, set it to 0 498 # first value is set to 0, because of the assumption that 499 # the first base will always be trimmed out 500 cummul_score = [0] 501 for i in range(1, len(score_list)): 502 score = cummul_score[-1] + score_list[i] 503 if score < 0: 504 cummul_score.append(0) 505 else: 506 cummul_score.append(score) 507 if not start: 508 # trim_start = value when cummulative score is first > 0 509 trim_start = i 510 start = True 511 512 # trim_finish = index of highest cummulative score, 513 # marking the end of sequence segment with highest cummulative score 514 trim_finish = cummul_score.index(max(cummul_score)) 515 516 return seq_record[trim_start:trim_finish]
517 518
519 -def _parse_tag_data(elem_code, elem_num, raw_data):
520 """Returns single data value. 521 522 Arguments: 523 - elem_code - What kind of data 524 - elem_num - How many data points 525 - raw_data - abi file object from which the tags would be unpacked 526 """ 527 if elem_code in _BYTEFMT: 528 # because '>1s' unpack differently from '>s' 529 if elem_num == 1: 530 num = '' 531 else: 532 num = str(elem_num) 533 fmt = '>' + num + _BYTEFMT[elem_code] 534 535 assert len(raw_data) == struct.calcsize(fmt) 536 data = struct.unpack(fmt, raw_data) 537 538 # no need to use tuple if len(data) == 1 539 # also if data is date / time 540 if elem_code not in [10, 11] and len(data) == 1: 541 data = data[0] 542 543 # account for different data types 544 if elem_code == 2: 545 return _bytes_to_string(data) 546 elif elem_code == 10: 547 return str(datetime.date(*data)) 548 elif elem_code == 11: 549 return str(datetime.time(*data[:3])) 550 elif elem_code == 13: 551 return bool(data) 552 elif elem_code == 18: 553 return _bytes_to_string(data[1:]) 554 elif elem_code == 19: 555 return _bytes_to_string(data[:-1]) 556 else: 557 return data 558 else: 559 return None
560 561 if __name__ == '__main__': 562 pass 563