Package Bio :: Package PopGen :: Package GenePop :: Module FileParser
[hide private]
[frames | no frames]

Source Code for Module Bio.PopGen.GenePop.FileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This class provides code to parse BIG GenePop files. 
  8   
  9  The difference between this class and the standard Bio.PopGen.GenePop.Record 
 10  class is that this one does not read the whole file to memory. 
 11  It provides an iterator interface, slower but consuming much less memory. 
 12  Should be used with big files (Thousands of markers and individuals). 
 13   
 14  See http://wbiomed.curtin.edu.au/genepop/ , the format is documented 
 15  here: http://wbiomed.curtin.edu.au/genepop/help_input.html . 
 16   
 17  Classes: 
 18  FileRecord           Holds GenePop data. 
 19   
 20  Functions: 
 21  read                 Parses a GenePop file and returns a FileRecord. 
 22   
 23  """ 
 24  from Bio.PopGen.GenePop import get_indiv 
 25   
 26  __docformat__ = "restructuredtext en" 
 27   
 28   
def read(fname):
    """Open a GenePop file for incremental parsing.

    fname is the name of a file that contains a GenePop record.

    Returns a FileRecord built from that file name.
    """
    return FileRecord(fname)
36 37
class FileRecord(object):
    """Holds information from a GenePop record, read incrementally from disk.

    Members:

        - marker_len         The marker length (2 or 3 digit code per allele).

        - comment_line       Comment line.

        - loci_list          List of loci names.

        - current_pop        Index (0 based) of the population being read.

        - current_ind        Number of individuals already read in that
                             population.

    Functions:

        - get_individual     Returns the next individual of the current
                             population.

        - skip_population    Skips the current population.

        - close              Closes the underlying file handle.

    skip_population skips the individuals of the current population, returns
    True if there are more populations.

    get_individual returns an individual of the current population, True
    when the population ended but another one follows, or False at the end
    of the file.
    Each individual is a pair composed by individual name and a list of
    alleles (2 per marker or 1 for haploid data).
    Examples::

        ('Ind1', [(1,2),    (3,3), (200,201)])
        ('Ind2', [(2,None), (3,3), (None,None)])
        ('Other1', [(1,1),  (4,3), (200,200)])

    The record keeps its input file open while in use; call close() (or
    use the record in a ``with`` statement) to release it.
    """

    def __init__(self, fname):
        """Stores the file name and parses the header of the file.

        fname - name of the GenePop file to read.

        Raises ValueError (from start_read) if no population marker is
        found in the file.
        """
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        # Defined before start_read so close() is always safe to call.
        self._handle = None
        self.start_read()

    def __enter__(self):
        """Allows ``with FileRecord(fname) as rec:`` usage."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Closes the file handle when leaving a ``with`` block."""
        self.close()

    def close(self):
        """Closes the underlying file handle. Safe to call repeatedly."""
        handle = getattr(self, "_handle", None)
        if handle is not None and not handle.closed:
            handle.close()

    @staticmethod
    def _format_individual(name, markers, skip_positions=(), marker_len=3):
        """Formats one individual as a GenePop text line.

        name           - individual name
        markers        - list of allele tuples, one tuple per marker
        skip_positions - marker positions (0 based) left out of the output
        marker_len     - width to which every allele is left-padded with '0'

        Returns the line, including the trailing newline. Missing alleles
        (None) are written as zeros.
        """
        pieces = [name, ","]
        for position, marker in enumerate(markers):
            if position in skip_positions:
                continue
            pieces.append(" ")
            for allele in marker:
                text = "0" if allele is None else str(allele)
                pieces.append(text.rjust(marker_len, "0"))
        pieces.append("\n")
        return "".join(pieces)

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.

        The whole file is re-read from the start; afterwards the reading
        position (population and individual) is restored.
        """
        marker_len = 3
        rep = [self.comment_line + "\n"]
        rep.append("\n".join(self.loci_list) + "\n")
        # Remember where we were: skip_header resets both counters.
        current_pop = self.current_pop
        current_ind = self.current_ind
        self._handle.seek(0)
        self.skip_header()
        rep.append("Pop\n")
        while True:
            res = self.get_individual()
            if res is True:
                rep.append("Pop\n")
            elif res is False:
                break
            else:
                name, markers = res
                rep.append(self._format_individual(name, markers,
                                                   marker_len=marker_len))
        self.seek_position(current_pop, current_ind)
        return "".join(rep)

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

        Opens the file, reads the comment line and the loci names and
        leaves the handle positioned just after the first population
        marker.

        Raises ValueError if the file contains no population marker; the
        handle is closed before the exception propagates.
        """
        # Do not leak a handle left over from a previous call.
        self.close()
        self._handle = open(self.fname)
        try:
            self.comment_line = self._handle.readline().rstrip()
            # We can now have one loci per line or all loci in a single line
            # separated by either space or comma+space...
            # We will remove all commas on loci... that should not be a
            # problem
            sample_loci_line = self._handle.readline().rstrip().replace(',', '')
            self.loci_list.extend(sample_loci_line.split(' '))
            for line in self._handle:
                line = line.rstrip()
                if line.upper() == 'POP':
                    break
                self.loci_list.append(line)
            else:
                raise ValueError('No population data found, file probably not GenePop related')
        except Exception:
            self.close()
            raise
        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open.

        Resets the population and individual counters and consumes lines up
        to and including the first population marker.
        """
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

        pop   - pop position (0 is first)
        indiv - individual in pop
        """
        self._handle.seek(0)
        self.skip_header()
        while pop > 0:
            self.skip_population()
            pop -= 1
        while indiv > 0:
            self.get_individual()
            indiv -= 1

    def skip_population(self):
        """Skips the current population.

        Returns True if there is another population (the reading position
        is then at its first individual), False if the end of the file was
        reached.
        """
        for line in self._handle:
            if line.rstrip().upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
        # Iterating a file never yields an empty string, so reaching this
        # point is the only end-of-file signal.
        return False

    def get_individual(self):
        """Gets the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.
        """
        # The loop body always returns: the for statement is only used to
        # pull a single line while staying on the handle's iterator.
        for line in self._handle:
            line = line.rstrip()
            if line.upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
            self.current_ind += 1
            indiv_name, allele_list, ignore = get_indiv(line)
            return (indiv_name, allele_list)
        return False

    def remove_population(self, pos, fname):
        """Removes a population (by position).

        pos   - position
        fname - file to be created with population removed

        The source file is re-read with an independent reader, so the
        reading position of this record is not affected. A population
        marker is only written when an individual follows it.
        """
        # Equivalent to the module level read(), without depending on it.
        old_rec = FileRecord(self.fname)
        try:
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                start_pop = True
                l_parser = old_rec.get_individual()
                while l_parser:
                    if curr_pop == pos:
                        # If the marker of the next population was already
                        # consumed (the removed population is empty),
                        # skipping again would drop one population too many.
                        if l_parser is not True:
                            old_rec.skip_population()
                        curr_pop += 1
                        start_pop = True
                    elif l_parser is True:
                        curr_pop += 1
                        start_pop = True
                    else:
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def _remove_loci(self, positions, fname):
        """Writes a copy of the file without the loci at the given positions.

        positions - iterable of locus positions (0 based); duplicates are
                    ignored and the iterable itself is not modified
        fname     - file to be created

        Raises IndexError for a position outside the loci list; this is
        checked before the output file is opened.
        """
        dropped = set(positions)
        old_rec = FileRecord(self.fname)
        try:
            loci_list = old_rec.loci_list
            # Delete from the end so earlier indexes stay valid.
            for pos in sorted(dropped, reverse=True):
                del loci_list[pos]
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in loci_list:
                    f.write(locus + "\n")
                f.write("POP\n")
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        f.write("POP\n")
                    else:
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers,
                                                        dropped))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_position(self, pos, fname):
        """Removes a locus by position.

        pos   - position
        fname - file to be created with locus removed
        """
        self._remove_loci([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Removes a set of loci by position.

        positions - positions (left unmodified)
        fname     - file to be created with locus removed
        """
        self._remove_loci(positions, fname)

    def remove_locus_by_name(self, name, fname):
        """Removes a locus by name.

        name  - name
        fname - file to be created with locus removed

        Only the first locus with that name is removed. If no locus has
        that name, nothing is written.
        """
        for position, locus in enumerate(self.loci_list):
            if locus == name:
                self.remove_locus_by_position(position, fname)
                return
        # If here then locus not existent... Maybe raise exception?
        # Although it should be Ok... Just a boolean return, maybe?

    def remove_loci_by_name(self, names, fname):
        """Removes a loci list (by name).

        names - names
        fname - file to be created with loci removed

        Names that match no locus are ignored; the output file is written
        in any case.
        """
        positions = [position
                     for position, locus in enumerate(self.loci_list)
                     if locus in names]
        self.remove_loci_by_position(positions, fname)