Package Bio :: Package PopGen :: Package GenePop :: Module FileParser
[hide private]
[frames] | no frames]

Source Code for Module Bio.PopGen.GenePop.FileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This class provides code to parse BIG GenePop files. 
  8   
  9  The difference between this class and the standard Bio.PopGen.GenePop.Record 
 10  class is that this one does not read the whole file to memory. 
 11  It provides an iterator interface, slower but consuming much less memory. 
 12  Should be used with big files (Thousands of markers and individuals). 
 13   
 14  See http://wbiomed.curtin.edu.au/genepop/ , the format is documented 
 15  here: http://wbiomed.curtin.edu.au/genepop/help_input.html . 
 16   
 17  Classes: 
 18  FileRecord           Holds GenePop data. 
 19   
 20  Functions: 
 21   
 22   
 23  """ 
 24  from Bio.PopGen.GenePop import get_indiv 
 25   
 26   
def read(fname):
    """Open a GenePop file for incremental parsing.

    fname - name of a file that contains a GenePop record.

    Returns the FileRecord built from that file name.
    """
    return FileRecord(fname)
34 35
class FileRecord(object):
    """Holds information from a GenePop record, read lazily from disk.

    Only the header (comment line and loci names) is kept in memory; the
    individuals are parsed one at a time from the underlying file.

    Members:
    comment_line    Comment line (first line of the file).

    loci_list       List of loci names.

    fname           Name of the file being parsed.

    current_pop     Index (0 based) of the population being read.

    current_ind     Number of individuals already read in current_pop.

    Functions:
    get_individual  Returns the next individual of the current population.

    skip_population Skips the current population.

    close           Closes the underlying file.

    skip_population skips the individuals of the current population, returns
    True if there are more populations.

    get_individual returns an individual of the current population, True
    when the population ended but another one follows, or False at the end
    of the file.
    Each individual is a pair composed by individual
    name and a list of alleles (2 per marker or 1 for haploid data).
    Examples
    ('Ind1', [(1,2),    (3,3), (200,201)])
    ('Ind2', [(2,None), (3,3), (None,None)])
    ('Other1', [(1,1),  (4,3), (200,200)])

    The record can be used as a context manager, in which case the file
    is closed when the block is left.
    """

    def __init__(self, fname):
        """Opens fname and parses its header.

        fname - name of a file that contains a GenePop record.

        Raises ValueError if no population marker is found.
        """
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        self.start_read()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Closes the underlying file. Safe to call more than once."""
        handle = getattr(self, '_handle', None)
        if handle is not None:
            handle.close()

    @staticmethod
    def _format_individual(name, markers, skipped=(), marker_len=3):
        """Returns the GenePop text line for one individual.

        name       - individual name
        markers    - list of allele tuples, None meaning missing data
        skipped    - marker positions (0 based) to leave out
        marker_len - digits per allele, left-padded with zeros
        """
        parts = [name, ',']
        for marker_pos, marker in enumerate(markers):
            if marker_pos in skipped:
                continue
            parts.append(' ')
            for al in marker:
                # Missing alleles are written as zero.
                al_str = '0' if al is None else str(al)
                parts.append(al_str.rjust(marker_len, '0'))
        parts.append('\n')
        return "".join(parts)

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.

        The whole file is re-read; the reading position held before the
        call is restored afterwards.
        """
        marker_len = 3
        rep = [self.comment_line + '\n']
        rep.append('\n'.join(self.loci_list) + '\n')
        # Remember where we were, as the file is re-read from the start.
        current_pop = self.current_pop
        current_ind = self.current_ind
        self._handle.seek(0)
        self.skip_header()
        rep.append('Pop\n')
        while True:
            res = self.get_individual()
            if res is True:
                rep.append('Pop\n')
            elif res is False:
                break
            else:
                name, markers = res
                rep.append(self._format_individual(name, markers,
                                                   marker_len=marker_len))
        self.seek_position(current_pop, current_ind)
        return "".join(rep)

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

        Opens the file, reads the comment line and the loci names and
        stops right after the first population marker.

        Raises ValueError if the file has no population marker; the file
        is closed before the exception propagates.
        """
        self._handle = open(self.fname)
        try:
            self.comment_line = self._handle.readline().rstrip()
            # We can now have one loci per line or all loci in a single line
            # separated by either space or comma+space...
            # We will remove all commas on loci... that should not be a
            # problem
            sample_loci_line = self._handle.readline().rstrip()
            sample_loci_line = sample_loci_line.replace(',', '')
            self.loci_list.extend(sample_loci_line.split(' '))
            for line in self._handle:
                line = line.rstrip()
                if line.upper() == 'POP':
                    break
                self.loci_list.append(line)
            else:
                raise ValueError('No population data found, file probably not GenePop related')
        except Exception:
            # Do not leak the handle when the header cannot be parsed.
            self._handle.close()
            raise
        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open."""
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

        pop   - pop position (0 is first)
        indiv - individual in pop
        """
        self._handle.seek(0)
        self.skip_header()
        while pop > 0:
            self.skip_population()
            pop -= 1
        while indiv > 0:
            self.get_individual()
            indiv -= 1

    def skip_population(self):
        """Skips the current population.

        Returns True if there is another pop, False at the end of file.
        """
        for line in self._handle:
            if line.rstrip().upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
        return False

    def get_individual(self):
        """Gets the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.
        """
        # Exactly one line is consumed per call.
        line = next(self._handle, None)
        if line is None:
            return False
        line = line.rstrip()
        if line.upper() == 'POP':
            self.current_pop += 1
            self.current_ind = 0
            return True
        self.current_ind += 1
        indiv_name, allele_list, _ignored = get_indiv(line)
        return (indiv_name, allele_list)

    def remove_population(self, pos, fname):
        """Removes a population (by position).

        pos   - position
        fname - file to be created with population removed

        The source file is re-read through a separate record, so the
        reading position of this record is not affected.
        """
        old_rec = FileRecord(self.fname)
        try:
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                # The POP marker is written lazily, just before the first
                # individual that is kept in each population.
                start_pop = True
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        # Population boundary; must be checked first so an
                        # empty population at pos does not swallow the
                        # following one.
                        curr_pop += 1
                        start_pop = True
                    elif curr_pop == pos:
                        if not old_rec.skip_population():
                            break
                        curr_pop += 1
                        start_pop = True
                    else:
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def _write_without_loci(self, positions, fname):
        """Writes a copy of the file to fname, dropping the given loci.

        positions - iterable of locus positions (negative values count
                    from the end); it is not modified
        fname     - file to be created

        Raises IndexError if a position is out of range; this is checked
        before the output file is opened.
        """
        old_rec = FileRecord(self.fname)
        try:
            loci_list = old_rec.loci_list
            num_loci = len(loci_list)
            removed = set()
            for pos in positions:
                index = pos + num_loci if pos < 0 else pos
                if not 0 <= index < num_loci:
                    raise IndexError('locus position out of range: %s'
                                     % (pos,))
                removed.add(index)
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for index, locus in enumerate(loci_list):
                    if index not in removed:
                        f.write(locus + "\n")
                f.write("POP\n")
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        f.write("POP\n")
                    else:
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers,
                                                        removed))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_position(self, pos, fname):
        """Removes a locus by position.

        pos   - position
        fname - file to be created with locus removed
        """
        self._write_without_loci([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Removes a set of loci by position.

        positions - positions (left unmodified, duplicates are ignored)
        fname     - file to be created with locus removed
        """
        self._write_without_loci(positions, fname)

    def remove_locus_by_name(self, name, fname):
        """Removes a locus by name.

        name  - name
        fname - file to be created with locus removed

        Only the first locus with that name is removed.
        """
        for i, locus in enumerate(self.loci_list):
            if locus == name:
                self.remove_locus_by_position(i, fname)
                return
        # NOTE: if we get here the locus does not exist and no file is
        # written. Maybe raise an exception, or return a boolean?

    def remove_loci_by_name(self, names, fname):
        """Removes a loci list (by name).

        names - names
        fname - file to be created with loci removed
        """
        positions = [i for i, locus in enumerate(self.loci_list)
                     if locus in names]
        # NOTE: names that match no locus are silently ignored.
        self.remove_loci_by_position(positions, fname)