Package Bio :: Package SeqIO :: Module AbiIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO.AbiIO

  1  # Copyright 2011 by Wibowo Arindrarto (w.arindrarto@gmail.com) 
  2  # Revisions copyright 2011-2016 by Peter Cock. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license. Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Bio.SeqIO parser for the ABI format. 
  8   
  9  ABI is the format used by Applied Biosystem's sequencing machines to store 
 10  sequencing results. 
 11   
 12  For more details on the format specification, visit: 
 13  http://www.appliedbiosystem.com/support/software_community/ABIF_File_Format.pdf 
 14   
 15  """ 
 16   
 17   
 18  import datetime 
 19  import struct 
 20   
 21  from os.path import basename 
 22   
 23  from Bio import Alphabet 
 24  from Bio.Alphabet.IUPAC import ambiguous_dna, unambiguous_dna 
 25  from Bio.Seq import Seq 
 26  from Bio.SeqRecord import SeqRecord 
 27   
 28  from Bio._py3k import _bytes_to_string, _as_bytes 
 29  from Bio._py3k import zip 
 30   
 31  # dictionary for determining which tags goes into SeqRecord annotation 
 32  # each key is tag_name + tag_number 
 33  # if a tag entry needs to be added, just add its key and its key 
 34  # for the annotations dictionary as the value 
 35  # dictionary for tags that require preprocessing before use in creating 
 36  # seqrecords 
 37  _EXTRACT = { 
 38      'TUBE1': 'sample_well', 
 39      'DySN1': 'dye', 
 40      'GTyp1': 'polymer', 
 41      'MODL1': 'machine_model', 
 42  } 
 43   
 44   
 45  # Complete data structure representing 98% of the API. The general section 
 46  # represents the part of the API that's common to ALL instruments, whereas the 
 47  # instrument specific sections are labelled as they are in the ABIF spec 
 48  # 
 49  # Keys don't seem to clash from machine to machine, so when we parse, we look 
 50  # for ANY key, and store that in the raw ABIF data structure attached to the 
 51  # annotations, with the assumption that anyone parsing the data can look up the 
 52  # spec themself 
 53  # 
 54  # Key definitions are retained in case end users want "nice" labels pre-made 
 55  # for them for all of the available fields. 
 56  _INSTRUMENT_SPECIFIC_TAGS = {} 
 57   
 58  _INSTRUMENT_SPECIFIC_TAGS['general'] = { 
 59      'APFN2': 'Sequencing Analysis parameters file name', 
 60      'APXV1': 'Analysis Protocol XML schema version', 
 61      'APrN1': 'Analysis Protocol settings name', 
 62      'APrV1': 'Analysis Protocol settings version', 
 63      'APrX1': 'Analysis Protocol XML string', 
 64      'CMNT1': 'Sample Comment', 
 65      'CTID1': 'Container Identifier, a.k.a. plate barcode', 
 66      'CTNM1': 'Container name, usually identical to CTID, but not necessarily so', 
 67      'CTTL1': 'Comment Title', 
 68      'CpEP1': 'Capillary type electrophoresis. 1 for a capillary based machine. 0 for a slab gel based machine.', 
 69      'DATA1': 'Channel 1 raw data', 
 70      'DATA2': 'Channel 2 raw data', 
 71      'DATA3': 'Channel 3 raw data', 
 72      'DATA4': 'Channel 4 raw data', 
 73      'DATA5': 'Short Array holding measured volts/10 (EP voltage) during run', 
 74      'DATA6': 'Short Array holding measured milliAmps trace (EP current) during run', 
 75      'DATA7': 'Short Array holding measured milliWatts trace (Laser EP Power) during run', 
 76      'DATA8': 'Short Array holding measured oven Temperature (polymer temperature) trace during run', 
 77      'DATA9': 'Channel 9 processed data', 
 78      'DATA10': 'Channel 10 processed data', 
 79      'DATA11': 'Channel 11 processed data', 
 80      'DATA12': 'Channel 12 processed data', 
 81      # Prism 3100/3100-Avant may provide DATA105 
 82      #          3130/3130-XL may provide DATA105 
 83      # 3530/3530-XL may provide DATA105-199, 9-12, 205-299 
 84      'DSam1': 'Downsampling factor', 
 85      'DySN1': 'Dye set name', 
 86      'Dye#1': 'Number of dyes', 
 87      'DyeN1': 'Dye 1 name', 
 88      'DyeN2': 'Dye 2 name', 
 89      'DyeN3': 'Dye 3 name', 
 90      'DyeN4': 'Dye 4 name', 
 91      'DyeW1': 'Dye 1 wavelength', 
 92      'DyeW2': 'Dye 2 wavelength', 
 93      'DyeW3': 'Dye 3 wavelength', 
 94      'DyeW4': 'Dye 4 wavelength', 
 95      # 'DyeN5-N': 'Dye 5-N Name', 
 96      # 'DyeW5-N': 'Dye 5-N Wavelength', 
 97      'EPVt1': 'Electrophoresis voltage setting (volts)', 
 98      'EVNT1': 'Start Run event', 
 99      'EVNT2': 'Stop Run event', 
100      'EVNT3': 'Start Collection event', 
101      'EVNT4': 'Stop Collection event', 
102      'FWO_1': 'Base Order. Sequencing Analysis Filter wheel order. Fixed for 3500 at "GATC"', 
103      'GTyp1': 'Gel or polymer Type', 
104      'InSc1': 'Injection time (seconds)', 
105      'InVt1': 'Injection voltage (volts)', 
106      'LANE1': 'Lane/Capillary', 
107      'LIMS1': 'Sample tracking ID', 
108      'LNTD1': 'Length to detector', 
109      'LsrP1': 'Laser Power setting (micro Watts)', 
110      'MCHN1': 'Instrument name and serial number', 
111      'MODF1': 'Data collection module file', 
112      'MODL1': 'Model number', 
113      'NAVG1': 'Pixels averaged per lane', 
114      'NLNE1': 'Number of capillaries', 
115      'OfSc1': 'List of scans that are marked off scale in Collection. (optional)', 
116      # OvrI and OrvV are listed as "1-N", and "One for each dye (unanalyzed 
117      # and/or analyzed data)" 
118      'OvrI1': 'List of scan number indexes that have values greater than 32767 but did not ' 
119               'saturate the camera. In Genemapper samples, this can have indexes with ' 
120               'values greater than 32000. In sequencing samples, this cannot have ' 
121               'indexes with values greater than 32000.', 
122      'OvrI2': 'List of scan number indexes that have values greater than 32767 but did not ' 
123               'saturate the camera. In Genemapper samples, this can have indexes with ' 
124               'values greater than 32000. In sequencing samples, this cannot have ' 
125               'indexes with values greater than 32000.', 
126      'OvrI3': 'List of scan number indexes that have values greater than 32767 but did not ' 
127               'saturate the camera. In Genemapper samples, this can have indexes with ' 
128               'values greater than 32000. In sequencing samples, this cannot have ' 
129               'indexes with values greater than 32000.', 
130      'OvrI4': 'List of scan number indexes that have values greater than 32767 but did not ' 
131               'saturate the camera. In Genemapper samples, this can have indexes with ' 
132               'values greater than 32000. In sequencing samples, this cannot have ' 
133               'indexes with values greater than 32000.', 
134      'OvrV1': 'List of color data values found at the locations listed in the OvrI tag. ' 
135               'There must be exactly as many numbers in this array as in the OvrI array.', 
136      'OvrV2': 'List of color data values found at the locations listed in the OvrI tag. ' 
137               'There must be exactly as many numbers in this array as in the OvrI array.', 
138      'OvrV3': 'List of color data values found at the locations listed in the OvrI tag. ' 
139               'There must be exactly as many numbers in this array as in the OvrI array.', 
140      'OvrV4': 'List of color data values found at the locations listed in the OvrI tag. ' 
141               'There must be exactly as many numbers in this array as in the OvrI array.', 
142      'PDMF1': 'Sequencing Analysis Mobility file name chosen in collection', 
143      'RMXV1': 'Run Module XML schema version', 
144      'RMdN1': 'Run Module name (same as MODF)', 
145      'RMdX1': 'Run Module XML string', 
146      'RPrN1': 'Run Protocol name', 
147      'RPrV1': 'Run Protocol version', 
148      'RUND1': 'Run Started Date', 
149      'RUND2': 'Run Stopped Date', 
150      'RUND3': 'Data Collection Started Date', 
151      'RUND4': 'Data Collection Stopped date', 
152      'RUNT1': 'Run Started Time', 
153      'RUNT2': 'Run Stopped Time', 
154      'RUNT3': 'Data Collection Started Time', 
155      'RUNT4': 'Data Collection Stopped Time', 
156      'Rate1': 'Scanning Rate. Milliseconds per frame.', 
157      'RunN1': 'Run Name', 
158      'SCAN1': 'Number of scans', 
159      'SMED1': 'Polymer lot expiration date', 
160      'SMLt1': 'Polymer lot number', 
161      'SMPL1': 'Sample name', 
162      'SVER1': 'Data collection software version', 
163      'SVER3': 'Data collection firmware version', 
164      'Satd1': 'Array of longs representing the scan numbers of data points, which are flagged as saturated by data collection (optional)', 
165      'Scal1': 'Rescaling divisor for color data', 
166      'Scan1': 'Number of scans (legacy - use SCAN)', 
167      'TUBE1': 'Well ID', 
168      'Tmpr1': 'Run temperature setting', 
169      'User1': 'Name of user who created the plate (optional)', 
170  } 
171   
172  #  No instrument specific tags 
173  # _INSTRUMENT_SPECIFIC_TAGS['abi_prism_3100/3100-Avant'] = { 
174  # } 
175   
176  _INSTRUMENT_SPECIFIC_TAGS['abi_3130/3130xl'] = { 
177      'CTOw1': 'Container owner', 
178      'HCFG1': 'Instrument Class', 
179      'HCFG2': 'Instrument Family', 
180      'HCFG3': 'Official Instrument Name', 
181      'HCFG4': 'Instrument Parameters', 
182      'RMdVa1': 'Run Module version', 
183  } 
184   
185  _INSTRUMENT_SPECIFIC_TAGS['abi_3530/3530xl'] = { 
186      'AAct1': 'Primary Analysis Audit Active indication. True if system auditing was enabled during the last write of this file, ' 
187               'false if system auditing was disabled.', 
188      'ABED1': 'Anode buffer expiration date using ISO 8601 format using the patterns YYYY-MM-DDTHH:MM:SS.ss+/-HH:MM. Hundredths of a second are optional.', 
189      'ABID1': 'Anode buffer tray first installed date', 
190      'ABLt1': 'Anode buffer lot number', 
191      'ABRn1': 'Number of runs (injections) processed with the current Anode Buffer (runs allowed - runs remaining)', 
192      'ABTp1': 'Anode buffer type', 
193      'AEPt1': 'Analysis Ending scan number for basecalling on initial analysis', 
194      'AEPt2': 'Analysis Ending scan number for basecalling on last analysis', 
195      'APCN1': 'Amplicon name', 
196      'ARTN1': 'Analysis Return code. Produced only by 5 Prime basecaller 1.0b3', 
197      'ASPF1': 'Flag to indicate whether adaptive processing worked or not', 
198      'ASPt1': 'Analysis Starting scan number for first analysis', 
199      'ASPt2': 'Analysis Starting scan number for last analysis', 
200      'AUDT2': 'Audit log used across 3500 software (optional)', 
201      'AVld1': 'Assay validation flag (true or false)', 
202      'AmbT1': 'Record of ambient temperature readings', 
203      'AsyC1': 'The assay contents (xml format)', 
204      'AsyN1': 'The assay name', 
205      'AsyV1': 'The assay version', 
206      'B1Pt1': 'Reference scan number for mobility and spacing curves for first analysis', 
207      'B1Pt2': 'Reference scan number for mobility and spacing curves for last analysis', 
208      'BCTS1': 'Basecaller timestamp. Time of completion of most recent analysis', 
209      'BcRn1': 'Basecalling qc code', 
210      'BcRs1': 'Basecalling warnings, a concatenated comma separated string', 
211      'BcRs2': 'Basecalling errors, a concatenated comma separated string', 
212      'CAED1': 'Capillary array expiration', 
213      'CALt1': 'Capillary array lot number', 
214      'CARn1': 'Number of injections processed (including the one of which this sample was a part) through the capillary array', 
215      'CASN1': 'Capillary array serial number', 
216      'CBED1': 'Cathode buffer expiration date', 
217      'CBID1': 'Cathode buffer tray first installed date', 
218      'CBLt1': 'Cathode buffer lot number', 
219      'CBRn1': 'Number of runs (injections) processed with the current Cathode Buffer (runs allowed - runs remaining)', 
220      'CBTp1': 'Cathode buffer type', 
221      'CLRG1': 'Start of the clear range (inclusive).', 
222      'CLRG2': 'Clear range length', 
223      'CRLn1': 'Contiguous read length', 
224      'CRLn2': 'One of "Pass", "Fail", or "Check"', 
225      'CTOw1': 'The name entered as the Owner of a plate, in the plate editor', 
226      'CkSm1': 'File checksum', 
227      'DCEv1': 'A list of door-close events, separated by semicolon. Door open events are generally paired with door close events.', 
228      'DCHT1': 'Reserved for backward compatibility. The detection cell heater temperature setting from the Run Module. Not used for 3500.', 
229      'DOEv1': 'A list of door-open events, separated by semicolon. Door close events are generally paired with door open events.', 
230      'ESig2': 'Electronic signature record used across 3500 software', 
231      'FTab1': 'Feature table. Can be created by Nibbler for Clear Range.', 
232      'FVoc1': 'Feature table vocabulary. Can be created by Nibbler for Clear Range.', 
233      'Feat1': 'Features. Can be created by Nibbler for Clear Range.', 
234      'HCFG1': 'The Instrument Class. All upper case, no spaces. Initial valid value: CE', 
235      'HCFG2': 'The Instrument Family. All upper case, no spaces. Valid values: 31XX or 37XX for UDC, 35XX (for 3500)', 
236      'HCFG3': 'The official instrument name. Mixed case, minus any special formatting. Initial valid values: 3130, 3130xl, 3730, 3730xl, 3500, 3500xl.', 
237      'HCFG4': 'Instrument parameters. Contains key-value pairs of instrument configuration information, separated by semicolons. ' 
238               'Four parameters are included initially: UnitID=<UNITD number>, CPUBoard=<board type>, ' 
239               'ArraySize=<# of capillaries>, SerialNumber=<Instrument Serial#>.', 
240      'InjN1': 'Injection name', 
241      'LAST1': 'Parameter settings information', 
242      'NOIS1': 'The estimate of rms baseline noise (S/N ratio) for each dye for a successfully analyzed sample. ' 
243               'Corresponds in order to the raw data in tags DATA 1-4. KB basecaller only.', 
244      'P1AM1': 'Amplitude of primary peak, which is not necessarily equal to corresponding signal strength at that position', 
245      'P1RL1': 'Deviation of primary peak position from (PLoc,2), times 100, rounded to integer', 
246      'P1WD1': 'Full-width Half-max of primary peak, times 100, rounded to integer. ' 
247               'Corresponding signal intensity is not necessarily equal to one half of primary peak amplitude', 
248      'P2AM1': 'Amplitude of secondary peak, which is not necessarily equal to corresponding signal strength at that position', 
249      'P2BA1': 'Base of secondary peak', 
250      'P2RL1': 'Deviation of secondary peak position from (PLoc,2), times 100, rounded to integer', 
251      'PBAS1': 'Array of sequence characters edited by user', 
252      'PBAS2': 'Array of sequence characters as called by Basecaller', 
253      'PCON1': 'Array of quality Values (0-255) as edited by user', 
254      'PCON2': 'Array of quality values (0-255) as called by Basecaller', 
255      'PDMF2': 'Mobility file name chosen in most recent analysis (identical to PDMF1)', 
256      'PLOC1': 'Array of peak locations edited by user', 
257      'PLOC2': 'Array of peak locations as called by Basecaller', 
258      'PRJT1': 'SeqScape 2.0 project template name', 
259      'PROJ4': 'SeqScape 2.0 project name', 
260      'PSZE1': 'Plate size. The number of sample positions in the container. Current allowed values: 96, 384.', 
261      'PTYP1': 'Plate type. Current allowed values: 96-Well, 384-Well.', 
262      'PuSc1': 'Median pupscore', 
263      'QV201': 'QV20+ value', 
264      'QV202': 'One of "Pass", "Fail", or "Check"', 
265      'QcPa1': 'QC parameters', 
266      'QcRn1': 'Trimming and QC code', 
267      'QcRs1': 'QC warnings, a concatenated comma separated string', 
268      'QcRs2': 'QC errors, a concatenated comma separated string', 
269      'RGOw1': 'The name entered as the Owner of a Results Group, in the Results Group Editor. Implemented as the user name from the results group.', 
270      'RInj1': 'Reinjection number. The reinjection number that this sample belongs to. Not present if there was no reinjection.', 
271      'RNmF1': 'Raman normalization factor', 
272      'RevC1': 'for whether the sequence has been complemented', 
273      'RunN1': 'Run name (which, for 3500, is different from injection name)', 
274      'S/N%1': 'Signal strength for each dye', 
275      'SMID1': 'Polymer first installed date', 
276      'SMRn1': 'Number of runs (injections) processed with the current polymer (runs allowed - runs remaining)', 
277      'SPAC1': 'Average peak spacing used in last analysis', 
278      'SPAC2': 'Basecaller name - corresponds to name of bcp file.', 
279      'SPAC3': 'Average peak spacing last calculated by the Basecaller.', 
280      'SPEC1': 'Sequencing Analysis Specimen Name', 
281      'SVER2': 'Basecaller version number', 
282      'SVER4': 'Sample File Format Version String', 
283      'ScPa1': 'The parameter string of size caller', 
284      'ScSt1': 'Raw data start point. Set to 0 for 3500 data collection.', 
285      'SpeN1': 'Active spectral calibration name', 
286      'TrPa1': 'Timming parameters', 
287      'TrSc1': 'Trace score.', 
288      'TrSc2': 'One of "Pass", "Fail", or "Check"', 
289      'phAR1': 'Trace peak aria ratio', 
290      'phCH1': 'Chemistry type ("term", "prim", "unknown"), based on DYE_1 information', 
291      'phDY1': 'Dye ("big", "d-rhod", "unknown"), based on mob file information', 
292      'phQL1': 'Maximum Quality Value', 
293      'phTR1': 'Set Trim region', 
294      'phTR2': 'Trim probability', 
295  } 
296   
297  _INSTRUMENT_SPECIFIC_TAGS['abi_3730/3730xl'] = { 
298      'BufT1': 'Buffer tray heater temperature (degrees C)', 
299  } 
300   
301  # dictionary for data unpacking format 
302  _BYTEFMT = { 
303      1: 'b',     # byte 
304      2: 's',     # char 
305      3: 'H',     # word 
306      4: 'h',     # short 
307      5: 'i',     # long 
308      6: '2i',    # rational, legacy unsupported 
309      7: 'f',     # float 
310      8: 'd',     # double 
311      10: 'h2B',  # date 
312      11: '4B',   # time 
313      12: '2i2b',  # thumb 
314      13: 'B',    # bool 
315      14: '2h',   # point, legacy unsupported 
316      15: '4h',   # rect, legacy unsupported 
317      16: '2i',   # vPoint, legacy unsupported 
318      17: '4i',   # vRect, legacy unsupported 
319      18: 's',    # pString 
320      19: 's',    # cString 
321      20: '2i',   # tag, legacy unsupported 
322  } 
323  # header data structure (exluding 4 byte ABIF marker) 
324  _HEADFMT = '>H4sI2H3I' 
325  # directory data structure 
326  _DIRFMT = '>4sI2H4I' 
327   
328  __global_tag_listing = [] 
329  for tag in _INSTRUMENT_SPECIFIC_TAGS.values(): 
330      __global_tag_listing += tag.keys() 
331   
332   
333 -def AbiIterator(handle, alphabet=None, trim=False):
334 """Iterator for the Abi file format. 335 """ 336 # raise exception is alphabet is not dna 337 if alphabet is not None: 338 if isinstance(Alphabet._get_base_alphabet(alphabet), 339 Alphabet.ProteinAlphabet): 340 raise ValueError( 341 "Invalid alphabet, ABI files do not hold proteins.") 342 if isinstance(Alphabet._get_base_alphabet(alphabet), 343 Alphabet.RNAAlphabet): 344 raise ValueError("Invalid alphabet, ABI files do not hold RNA.") 345 346 # raise exception if handle mode is not 'rb' 347 if hasattr(handle, 'mode'): 348 if set('rb') != set(handle.mode.lower()): 349 raise ValueError("ABI files has to be opened in 'rb' mode.") 350 351 # check if input file is a valid Abi file 352 handle.seek(0) 353 marker = handle.read(4) 354 if not marker: 355 # handle empty file gracefully 356 raise StopIteration 357 if marker != _as_bytes('ABIF'): 358 raise IOError('File should start ABIF, not %r' % marker) 359 360 # dirty hack for handling time information 361 times = {'RUND1': '', 'RUND2': '', 'RUNT1': '', 'RUNT2': '', } 362 363 # initialize annotations 364 annot = dict(zip(_EXTRACT.values(), [None] * len(_EXTRACT))) 365 366 # parse header and extract data from directories 367 header = struct.unpack(_HEADFMT, 368 handle.read(struct.calcsize(_HEADFMT))) 369 370 raw = dict() 371 for tag_name, tag_number, tag_data in _abi_parse_header(header, handle): 372 key = tag_name + str(tag_number) 373 374 raw[key] = tag_data 375 376 # PBAS2 is base-called sequence, only available in 3530 377 if key == 'PBAS2': 378 seq = tag_data 379 ambigs = 'KYWMRS' 380 if alphabet is None: 381 if set(seq).intersection(ambigs): 382 alphabet = ambiguous_dna 383 else: 384 alphabet = unambiguous_dna 385 # PCON2 is quality values of base-called sequence 386 elif key == 'PCON2': 387 qual = [ord(val) for val in tag_data] 388 # SMPL1 is sample id entered before sequencing run 389 elif key == 'SMPL1': 390 sample_id = tag_data 391 elif key in times: 392 times[key] = tag_data 393 else: 394 if key in _EXTRACT: 395 annot[_EXTRACT[key]] = tag_data 396 397 # set time annotations 398 annot['run_start'] = '%s %s' % (times['RUND1'], times['RUNT1']) 399 annot['run_finish'] = '%s %s' % (times['RUND2'], times['RUNT2']) 400 401 # raw data (for advanced end users benefit) 402 annot['abif_raw'] = raw 403 404 # use the file name as SeqRecord.name if available 405 try: 406 file_name = basename(handle.name).replace('.ab1', '') 407 except AttributeError: 408 file_name = "" 409 record = SeqRecord(Seq(seq, alphabet), 410 id=sample_id, name=file_name, 411 description='', 412 annotations=annot, 413 letter_annotations={'phred_quality': qual}) 414 415 if not trim: 416 yield record 417 else: 418 yield _abi_trim(record)
419 420
421 -def _AbiTrimIterator(handle):
422 """Iterator for the Abi file format that yields trimmed SeqRecord objects. 423 """ 424 return AbiIterator(handle, trim=True)
425 426
427 -def _abi_parse_header(header, handle):
428 """Generator that returns directory contents. 429 """ 430 # header structure (after ABIF marker): 431 # file version, tag name, tag number, 432 # element type code, element size, number of elements 433 # data size, data offset, handle (not file handle) 434 head_elem_size = header[4] 435 head_elem_num = header[5] 436 head_offset = header[7] 437 index = 0 438 439 while index < head_elem_num: 440 start = head_offset + index * head_elem_size 441 # add directory offset to tuple 442 # to handle directories with data size <= 4 bytes 443 handle.seek(start) 444 dir_entry = struct.unpack(_DIRFMT, 445 handle.read(struct.calcsize(_DIRFMT))) + (start,) 446 index += 1 447 # only parse desired dirs 448 key = _bytes_to_string(dir_entry[0]) 449 key += str(dir_entry[1]) 450 451 tag_name = _bytes_to_string(dir_entry[0]) 452 tag_number = dir_entry[1] 453 elem_code = dir_entry[2] 454 elem_num = dir_entry[4] 455 data_size = dir_entry[5] 456 data_offset = dir_entry[6] 457 tag_offset = dir_entry[8] 458 # if data size <= 4 bytes, data is stored inside tag 459 # so offset needs to be changed 460 if data_size <= 4: 461 data_offset = tag_offset + 20 462 handle.seek(data_offset) 463 data = handle.read(data_size) 464 yield tag_name, tag_number, \ 465 _parse_tag_data(elem_code, elem_num, data)
466 467
468 -def _abi_trim(seq_record):
469 """Trims the sequence using Richard Mott's modified trimming algorithm. 470 471 Arguments: 472 - seq_record - SeqRecord object to be trimmed. 473 474 Trimmed bases are determined from their segment score, which is a 475 cumulative sum of each base's score. Base scores are calculated from 476 their quality values. 477 478 More about the trimming algorithm: 479 http://www.phrap.org/phredphrap/phred.html 480 http://www.clcbio.com/manual/genomics/Quality_abif_trimming.html 481 """ 482 483 start = False # flag for starting position of trimmed sequence 484 segment = 20 # minimum sequence length 485 trim_start = 0 # init start index 486 cutoff = 0.05 # default cutoff value for calculating base score 487 488 if len(seq_record) <= segment: 489 return seq_record 490 else: 491 # calculate base score 492 score_list = [cutoff - (10 ** (qual / -10.0)) for qual in 493 seq_record.letter_annotations['phred_quality']] 494 495 # calculate cummulative score 496 # if cummulative value < 0, set it to 0 497 # first value is set to 0, because of the assumption that 498 # the first base will always be trimmed out 499 cummul_score = [0] 500 for i in range(1, len(score_list)): 501 score = cummul_score[-1] + score_list[i] 502 if score < 0: 503 cummul_score.append(0) 504 else: 505 cummul_score.append(score) 506 if not start: 507 # trim_start = value when cummulative score is first > 0 508 trim_start = i 509 start = True 510 511 # trim_finish = index of highest cummulative score, 512 # marking the end of sequence segment with highest cummulative score 513 trim_finish = cummul_score.index(max(cummul_score)) 514 515 return seq_record[trim_start:trim_finish]
516 517
518 -def _parse_tag_data(elem_code, elem_num, raw_data):
519 """Returns single data value. 520 521 Arguments: 522 - elem_code - What kind of data 523 - elem_num - How many data points 524 - raw_data - abi file object from which the tags would be unpacked 525 """ 526 if elem_code in _BYTEFMT: 527 # because '>1s' unpack differently from '>s' 528 if elem_num == 1: 529 num = '' 530 else: 531 num = str(elem_num) 532 fmt = '>' + num + _BYTEFMT[elem_code] 533 534 assert len(raw_data) == struct.calcsize(fmt) 535 data = struct.unpack(fmt, raw_data) 536 537 # no need to use tuple if len(data) == 1 538 # also if data is date / time 539 if elem_code not in [10, 11] and len(data) == 1: 540 data = data[0] 541 542 # account for different data types 543 if elem_code == 2: 544 return _bytes_to_string(data) 545 elif elem_code == 10: 546 return str(datetime.date(*data)) 547 elif elem_code == 11: 548 return str(datetime.time(*data[:3])) 549 elif elem_code == 13: 550 return bool(data) 551 elif elem_code == 18: 552 return _bytes_to_string(data[1:]) 553 elif elem_code == 19: 554 return _bytes_to_string(data[:-1]) 555 else: 556 return data 557 else: 558 return None
559 560 if __name__ == '__main__': 561 pass 562