Package Bio :: Package PopGen :: Package GenePop :: Module LargeFileParser
[hide private]
[frames] | no frames]

Source Code for Module Bio.PopGen.GenePop.LargeFileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Large file parsing of Genepop files 
  7   
  8  The standard parser loads the whole file into memory. This parser 
  9  provides an iterator over data. 
 10   
 11  Classes: 
 12  - Record - Holds GenePop data. 
 13   
 14  Functions: 
 15  - read - Parses a GenePop record (file) into a Record object. 
 16   
 17  """ 
 18   
 19   
def get_indiv(line):
    """Parse one individual line of a GenePop file.

    Arguments:
     - line - text of the form ``name, genotype genotype ...`` where the
       genotypes are separated by spaces and/or tabs.

    Returns a tuple ``(indiv_name, allele_list, marker_len)``:
     - indiv_name - everything before the last comma, unmodified.
     - allele_list - one tuple of ints per locus; two entries for diploid
       data, a single entry for haploid data.
     - marker_len - number of digits per allele (2 or 3), deduced from
       the width of the first genotype.

    Raises ValueError if the line has no comma or a genotype is not
    numeric, and IndexError if no genotype follows the comma.
    """
    # Genotype fields never contain commas, so the last comma is the
    # name/genotype separator.  Splitting from the right keeps names that
    # themselves contain commas (e.g. "Doe, John") intact instead of
    # failing to unpack.
    indiv_name, marker_line = line.rsplit(",", 1)
    # split() with no argument handles tabs, spaces and runs of either.
    markers = marker_line.split()
    # A first genotype of width 2 (haploid) or 4 (diploid) means two
    # digits per allele; anything else is treated as three digits.
    marker_len = 2 if len(markers[0]) in (2, 4) else 3
    try:
        allele_list = [
            (int(marker[:marker_len]), int(marker[marker_len:]))
            for marker in markers
        ]
    except ValueError:
        # Haploid data: the second half of each genotype is empty, so
        # int('') failed above.  Re-parse with one allele per locus.
        allele_list = [(int(marker[:marker_len]),) for marker in markers]
    return indiv_name, allele_list, marker_len
36 37
def read(handle):
    """Read the header of a GenePop file and return a Record for it.

    Arguments:
     - handle is a file-like object that contains a GenePop record.

    Only the comment line, the loci names, the first "POP" marker and
    the first individual are consumed here.  The marker and that
    individual are pushed onto ``record.stack`` so that
    ``Record.data_generator`` can replay them before it continues with
    the rest of the handle.
    """
    record = Record(handle)
    record.comment_line = str(handle.readline()).rstrip()

    # Loci may be listed one per line, or all on a single line separated
    # by spaces or by comma+space.  Commas are simply dropped before the
    # line is split on single spaces.
    first_loci_line = str(handle.readline()).rstrip()
    record.loci_list.extend(first_loci_line.replace(',', '').split(' '))

    # Every following line is a locus name until the first POP marker
    # (or the end of the file) is reached.
    for raw_line in iter(handle.readline, ""):
        stripped = raw_line.rstrip()
        if stripped.upper() == "POP":
            record.stack.append("POP")
            break
        record.loci_list.append(stripped)

    # Peek at the first individual to learn the allele width, then keep
    # the line so that it is not lost to the data generator.
    first_indiv = handle.readline().rstrip()
    _, _, record.marker_len = get_indiv(first_indiv)
    record.stack.append(first_indiv)
    return record
65 66
class Record(object):
    """Holds information from a GenePop record.

    Members:
     - marker_len - The marker length (2 or 3 digit code per allele).
     - comment_line - Comment line.
     - loci_list - List of loci names.
     - populations - Empty list created by the constructor; nothing in
       this class fills it.
     - stack - Lines already read from the handle that still have to be
       yielded by data_generator before the handle itself.
     - data_generator - Iterates over population data.

    The generator will only work once. If you want to read a handle
    twice you have to re-open it!

    data_generator yields either () - an empty tuple - marking a new
    population, or an individual. An individual is something like
    ('Ind1', [(1, 1), (3, None), (200, 201)]).
    In the case above the individual is called Ind1 and has three
    diploid loci. For the second locus, one of the alleles is unknown.
    """

    def __init__(self, handle):
        """Store the handle and initialise all members to empty values."""
        self.handle = handle
        self.marker_len = 0
        self.comment_line = ""
        self.loci_list = []
        self.populations = []
        self.stack = []

    def data_generator(self):
        """Yield population markers and individuals, one per input line.

        Lines stored in ``self.stack`` are processed first, followed by
        the lines remaining in ``self.handle``.  A line equal to "POP"
        (case-insensitive, trailing whitespace ignored) yields ``()``.
        Any other line is parsed with get_indiv and yields
        ``(indiv_name, allele_list)``, with every allele coded as 0
        replaced by None.
        """
        for source in (self.stack, self.handle):
            for line in source:
                line = line.rstrip()
                if line.upper() == "POP":
                    yield ()
                    continue
                indiv_name, allele_list, _ = get_indiv(line)
                # Allele code 0 means "unknown"; expose it as None.
                clean_list = [
                    tuple(None if allele == 0 else allele for allele in locus)
                    for locus in allele_list
                ]
                yield indiv_name, clean_list
        # Falling off the end finishes the generator.  An explicit
        # ``raise StopIteration()`` here would be converted into a
        # RuntimeError on Python 3.7+ (PEP 479).
118