Package Bio :: Package PopGen :: Package GenePop :: Module FileParser
[hide private]
[frames] | no frames]

Source Code for Module Bio.PopGen.GenePop.FileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This class provides code to parse BIG GenePop files. 
  8   
  9  The difference between this class and the standard Bio.PopGen.GenePop.Record 
 10  class is that this one does not read the whole file to memory. 
 11  It provides an iterator interface, slower but consuming much less memory. 
 12  Should be used with big files (Thousands of markers and individuals). 
 13   
 14  See http://wbiomed.curtin.edu.au/genepop/ , the format is documented 
 15  here: http://wbiomed.curtin.edu.au/genepop/help_input.html . 
 16   
 17  Classes: 
 18  FileRecord           Holds GenePop data. 
 19   
 20  Functions: 
 21  read                 Returns a FileRecord for a GenePop file name. 
 22   
 23  """ 
 24  from Bio.PopGen.GenePop import get_indiv 
 25   
 26   
def read(fname):
    """Open a GenePop file for lazy, individual-by-individual parsing.

    fname is the name of a file that contains a GenePop record.

    Returns a FileRecord built from that file name.
    """
    return FileRecord(fname)
34 35
class FileRecord(object):
    """Holds information from a GenePop record, read lazily from a file.

    The header (comment line and loci names) is parsed on construction;
    individuals are then read one at a time from the open file handle.

    Members:

        - comment_line    Comment line.

        - loci_list       List of loci names.

        - fname           Name of the file being parsed.

        - current_pop     Index of the population being read (0 is first).

        - current_ind     Number of individuals already read in that pop.

    Note: no ``marker_len`` attribute is set by this class; ``__str__`` and
    the ``remove_*`` writers always emit 3-digit alleles.

    Functions:

        - get_individual  Returns the next individual of the current
                          population.

        - skip_population Skips the current population.

        - close           Closes the underlying file handle.

    skip_population skips the individuals of the current population, returns
    True if there are more populations and False otherwise.

    get_individual returns an individual of the current population, True
    when the current population ended and another one follows, or False at
    the end of the file.
    Each individual is a pair composed by individual name and a list of
    alleles (2 per marker or 1 for haploid data).
    Examples::

        ('Ind1', [(1,2),    (3,3), (200,201)])
        ('Ind2', [(2,None), (3,3), (None,None)])
        ('Other1', [(1,1),  (4,3), (200,200)])

    The record can be used as a context manager so that the file handle is
    closed deterministically::

        with FileRecord(fname) as rec:
            ...
    """

    def __init__(self, fname):
        """Store the file name and parse the header of the file."""
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        self._handle = None
        self.start_read()

    def __enter__(self):
        """Return the record itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file handle when leaving a ``with`` block."""
        self.close()
        return False

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.

        The whole file is re-read from the start; the reading position that
        was current before the call is restored afterwards.
        """
        marker_len = 3
        rep = [self.comment_line + '\n']
        rep.append('\n'.join(self.loci_list) + '\n')
        # Remember where we were: the loop below consumes the whole file.
        current_pop = self.current_pop
        current_ind = self.current_ind
        try:
            self._handle.seek(0)
            self.skip_header()
            rep.append('Pop\n')
            while True:
                res = self.get_individual()
                if res is False:
                    break
                if res is True:
                    rep.append('Pop\n')
                else:
                    name, markers = res
                    rep.append(
                        self._format_individual(name, markers, marker_len))
        finally:
            self.seek_position(current_pop, current_ind)
        return "".join(rep)

    @staticmethod
    def _format_individual(name, markers, marker_len=3, skip=()):
        """Return one newline-terminated GenePop line for an individual.

        name       - individual name
        markers    - list of allele tuples, one tuple per marker
        marker_len - alleles are left-padded with '0' to this width
        skip       - container of marker positions to leave out

        Missing alleles (None) are written as zeros.
        """
        parts = [name, ',']
        for marker_pos, marker in enumerate(markers):
            if marker_pos in skip:
                continue
            parts.append(' ')
            for al in marker:
                al_str = '0' if al is None else str(al)
                parts.append(al_str.rjust(marker_len, '0'))
        parts.append('\n')
        return "".join(parts)

    def close(self):
        """Closes the underlying file handle. Safe to call more than once.

        Reading methods must not be used after the record is closed, unless
        start_read is called again.
        """
        handle = getattr(self, "_handle", None)
        if handle is not None:
            handle.close()
            self._handle = None

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

        Opens the file, reads the comment line and the loci names and leaves
        the handle positioned just after the first POP line.

        Raises ValueError if no POP line is found.
        """
        # Do not leak a handle left over from a previous call.
        self.close()
        handle = open(self.fname)
        self._handle = handle
        self.comment_line = handle.readline().rstrip()
        # We can now have one loci per line or all loci in a single line
        # separated by space, comma or comma+space. Commas are turned into
        # blanks (not dropped) so that "l1,l2" is not merged into "l1l2".
        loci = handle.readline().rstrip().replace(',', ' ').split()
        for line in handle:
            line = line.rstrip()
            if line.upper() == 'POP':
                break
            loci.append(line)
        else:
            self.close()
            raise ValueError('No population data found, file probably not GenePop related')
        self.loci_list = loci
        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open.

        Resets the population/individual counters and consumes lines up to
        and including the first POP line.
        """
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

        pop   - pop position (0 is first)
        indiv - individual in pop
        """
        self._handle.seek(0)
        self.skip_header()
        while pop > 0:
            self.skip_population()
            pop -= 1
        while indiv > 0:
            self.get_individual()
            indiv -= 1

    def skip_population(self):
        """Skips the current population.

        Returns True if there is another pop, False if the end of the file
        was reached.
        """
        for line in self._handle:
            if line.rstrip().upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
        # Iteration ended without meeting another POP line: end of file.
        return False

    def get_individual(self):
        """Gets the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.
        """
        # Only one line is consumed per call; the loop is just a convenient
        # way of getting "next line or end of file".
        for line in self._handle:
            line = line.rstrip()
            if line.upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
            self.current_ind += 1
            # The third value returned by get_indiv is not used here.
            indiv_name, allele_list, _ = get_indiv(line)
            return indiv_name, allele_list
        return False

    def remove_population(self, pos, fname):
        """Removes a population (by position).

        pos   - position
        fname - file to be created with population removed

        The source file is re-read through a separate record, so the reading
        position of this record is not affected.
        """
        old_rec = FileRecord(self.fname)
        try:
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                # The POP header is written lazily, just before the first
                # individual that is kept in a population.
                start_pop = True
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        # Population boundary. Checked first so that an
                        # empty population at `pos` does not cause the
                        # following population to be dropped instead.
                        curr_pop += 1
                        start_pop = True
                    elif curr_pop == pos:
                        # First individual of the unwanted population was
                        # just consumed; discard the remaining ones.
                        old_rec.skip_population()
                        curr_pop += 1
                    else:
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_position(self, pos, fname):
        """Removes a locus by position.

        pos   - position
        fname - file to be created with locus removed

        Raises IndexError (before creating fname) if pos is out of range.
        """
        self.remove_loci_by_position([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Removes a set of loci by position.

        positions - positions (not modified; duplicates are ignored and
                    negative values count from the end)
        fname     - file to be created with loci removed

        Raises IndexError (before creating fname) if a position is out of
        range.
        """
        old_rec = FileRecord(self.fname)
        try:
            n_loci = len(old_rec.loci_list)
            # Validate and normalise first, so that a bad position neither
            # truncates the output file nor desynchronises the loci names
            # from the marker columns.
            skip = set()
            for pos in positions:
                if not -n_loci <= pos < n_loci:
                    raise IndexError(
                        "locus position out of range: %r" % (pos,))
                skip.add(pos % n_loci)
            kept_loci = [locus
                         for i, locus in enumerate(old_rec.loci_list)
                         if i not in skip]
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in kept_loci:
                    f.write(locus + "\n")
                l_parser = old_rec.get_individual()
                f.write("POP\n")
                while l_parser:
                    if l_parser is True:
                        f.write("POP\n")
                    else:
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers,
                                                        skip=skip))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_name(self, name, fname):
        """Removes a locus by name.

        name  - name
        fname - file to be created with locus removed

        Only the first locus with that name is removed. If no locus has that
        name nothing is done and fname is not created.
        """
        for i, locus in enumerate(self.loci_list):
            if locus == name:
                self.remove_locus_by_position(i, fname)
                return
        # If here then locus not existent... Maybe raise exception?
        # Although it should be Ok... Just a boolean return, maybe?

    def remove_loci_by_name(self, names, fname):
        """Removes a loci list (by name).

        names - names
        fname - file to be created with loci removed

        Names that match no locus are ignored; fname is always created.
        """
        positions = [i for i, locus in enumerate(self.loci_list)
                     if locus in names]
        self.remove_loci_by_position(positions, fname)