Package Bio :: Package PopGen :: Package GenePop :: Module FileParser
[hide private]
[frames] | no frames]

Source Code for Module Bio.PopGen.GenePop.FileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """This class provides code to parse BIG GenePop files. 
  7   
  8  The difference between this class and the standard Bio.PopGen.GenePop.Record 
  9  class is that this one does not read the whole file to memory. 
 10  It provides an iterator interface, slower but consuming much less memory. 
 11  Should be used with big files (Thousands of markers and individuals). 
 12   
 13  See http://wbiomed.curtin.edu.au/genepop/ , the format is documented 
 14  here: http://wbiomed.curtin.edu.au/genepop/help_input.html . 
 15   
 16  Classes: 
 17  FileRecord           Holds GenePop data. 
 18   
 19  Functions: 
 20   
 21   
 22  """ 
 23  from Bio.PopGen.GenePop import get_indiv 
 24   
 25   
def read(fname):
    """Open a GenePop file for incremental parsing.

    fname is the name of a file that contains a GenePop record.

    Returns a FileRecord bound to that file.
    """
    return FileRecord(fname)
33 34
class FileRecord(object):
    """Hold information from a GenePop record without loading it in memory.

    The header (comment line and loci names) is parsed when the record is
    created; individuals are then read one at a time from the open file.

    Attributes:

        - comment_line       Comment line (first line of the file).

        - loci_list          List of loci names.

        - fname              Name of the file being parsed.

        - current_pop        Index of the population being read (0 is first).

        - current_ind        Number of individuals already read from the
                             current population.

    Functions:

        - get_individual     Returns the next individual of the current
                             population.

        - skip_population    Skips the current population.

        - close              Closes the underlying file.

    skip_population skips the individuals of the current population, returns
    True if there are more populations.

    get_individual returns an individual of the current population, True if
    the current population ended and another one follows, or False at the
    end of the file.
    Each individual is a pair composed by individual name and a list of
    alleles (2 per marker or 1 for haploid data).
    Examples::

        ('Ind1', [(1,2),    (3,3), (200,201)])
        ('Ind2', [(2,None), (3,3), (None,None)])
        ('Other1', [(1,1),  (4,3), (200,200)])

    The record keeps its file open until close() is called; it can also be
    used as a context manager (``with FileRecord(fname) as rec: ...``).
    """

    def __init__(self, fname):
        """Store the file name and parse the header of the file."""
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        self.start_read()

    def __enter__(self):
        """Return the record itself so it can be used in a with statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the underlying file when leaving a with statement."""
        self.close()

    def close(self):
        """Close the underlying file. Calling it more than once is harmless."""
        handle = getattr(self, "_handle", None)
        if handle is not None:
            handle.close()

    @staticmethod
    def _format_individual(name, markers, marker_len=3):
        """Return the newline-terminated GenePop text line of one individual.

        name       - individual name
        markers    - sequence of allele tuples, one tuple per marker
        marker_len - number of digits used to write each allele
        """
        parts = [name, ","]
        for marker in markers:
            parts.append(" ")
            for al in marker:
                # A missing allele (None) is written as zero.
                al_str = "0" if al is None else str(al)
                parts.append(al_str.rjust(marker_len, "0"))
        parts.append("\n")
        return "".join(parts)

    def __str__(self):
        """Return (reconstruct) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.

        The whole file is re-read from the start; the read position held
        before the call is restored afterwards, even if parsing fails.
        """
        marker_len = 3
        rep = [self.comment_line + "\n"]
        rep.append("\n".join(self.loci_list) + "\n")
        current_pop = self.current_pop
        current_ind = self.current_ind
        self._handle.seek(0)
        self.skip_header()
        rep.append("Pop\n")
        try:
            while True:
                res = self.get_individual()
                if res is False:
                    break
                if res is True:
                    rep.append("Pop\n")
                else:
                    name, markers = res
                    rep.append(
                        self._format_individual(name, markers, marker_len))
        finally:
            self.seek_position(current_pop, current_ind)
        return "".join(rep)

    def start_read(self):
        """Start parsing a file containing a GenePop file.

        Opens the file, reads the comment line and the loci names and leaves
        the file positioned on the first individual of the first population.

        Raises ValueError if no POP line is found; the file is closed before
        any error is propagated.
        """
        self._handle = open(self.fname)
        try:
            self.comment_line = self._handle.readline().rstrip()
            # We can now have one loci per line or all loci in a single line
            # separated by either space or comma+space...
            # We will remove all commas on loci... that should not be a
            # problem
            sample_loci_line = self._handle.readline().rstrip()
            sample_loci_line = sample_loci_line.replace(",", "")
            self.loci_list.extend(sample_loci_line.split(" "))
            for line in self._handle:
                line = line.rstrip()
                if line.upper() == "POP":
                    break
                self.loci_list.append(line)
            else:
                raise ValueError('No population data found, file probably not GenePop related')
        except Exception:
            # Do not leak the handle when the header cannot be parsed.
            self._handle.close()
            raise
        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skip the header. To be done after a re-open or a seek to 0.

        Resets the population and individual counters and consumes lines up
        to and including the first POP line.
        """
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seek a certain position in the file.

        pop   - pop position (0 is first)
        indiv - individual in pop

        The file is rewound and then read forward, so the cost grows with
        the distance from the start of the file.
        """
        self._handle.seek(0)
        self.skip_header()
        while pop > 0:
            self.skip_population()
            pop -= 1
        while indiv > 0:
            self.get_individual()
            indiv -= 1

    def skip_population(self):
        """Skip the current population.

        Returns True if there is another population, False if the end of
        the file was reached.
        """
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                self.current_pop += 1
                self.current_ind = 0
                return True
        # Iteration ended without meeting a POP line: end of file.
        return False

    def get_individual(self):
        """Get the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.
        """
        line = next(self._handle, None)
        if line is None:
            return False
        line = line.rstrip()
        if line.upper() == "POP":
            self.current_pop += 1
            self.current_ind = 0
            return True
        self.current_ind += 1
        indiv_name, allele_list, _ = get_indiv(line)
        return indiv_name, allele_list

    def remove_population(self, pos, fname):
        """Remove a population (by position).

        pos   - position (0 is first)
        fname - file to be created with population removed

        The source file is re-read through a temporary record, which is
        closed when the copy is done.
        """
        with FileRecord(self.fname) as old_rec:
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                # The POP line is written lazily, just before the first
                # individual that is kept from each population.
                start_pop = True
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        curr_pop += 1
                        start_pop = True
                    elif curr_pop == pos:
                        # First individual of the population to drop:
                        # fast-forward over the remaining ones.
                        old_rec.skip_population()
                        curr_pop += 1
                    else:
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers))
                    l_parser = old_rec.get_individual()

    def _copy_without_loci(self, positions, fname):
        """Write to fname a copy of the file without the given loci.

        positions - iterable of locus positions; negative positions count
                    from the end, duplicates are ignored
        fname     - file to be created

        Raises IndexError (before fname is created) if a position is out of
        range.
        """
        with FileRecord(self.fname) as old_rec:
            loci_list = old_rec.loci_list
            n_loci = len(loci_list)
            dropped = set()
            for pos in positions:
                if not -n_loci <= pos < n_loci:
                    raise IndexError(
                        "locus position out of range: %r" % (pos,))
                dropped.add(pos % n_loci)
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for idx, locus in enumerate(loci_list):
                    if idx not in dropped:
                        f.write(locus + "\n")
                f.write("POP\n")
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        f.write("POP\n")
                    else:
                        name, markers = l_parser
                        kept = [marker
                                for idx, marker in enumerate(markers)
                                if idx not in dropped]
                        f.write(self._format_individual(name, kept))
                    l_parser = old_rec.get_individual()

    def remove_locus_by_position(self, pos, fname):
        """Remove a locus by position.

        pos   - position
        fname - file to be created with locus removed

        Raises IndexError if pos is out of range.
        """
        self._copy_without_loci([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Remove a set of loci by position.

        positions - positions (the argument itself is not modified)
        fname     - file to be created with loci removed

        Raises IndexError if any position is out of range.
        """
        self._copy_without_loci(positions, fname)

    def remove_locus_by_name(self, name, fname):
        """Remove a locus by name.

        name  - name
        fname - file to be created with locus removed

        If no locus has that name nothing is done and no file is created.
        """
        for i, locus in enumerate(self.loci_list):
            if locus == name:
                self.remove_locus_by_position(i, fname)
                return
        # If we get here the locus does not exist... Maybe raise exception?
        # Although it should be Ok... Just a boolean return, maybe?

    def remove_loci_by_name(self, names, fname):
        """Remove a loci list (by name).

        names - names
        fname - file to be created with loci removed

        Names that match no locus are silently ignored; the output file is
        always created.
        """
        positions = [i for i, locus in enumerate(self.loci_list)
                     if locus in names]
        self.remove_loci_by_position(positions, fname)