Package Bio :: Package PopGen :: Package GenePop :: Module FileParser
[hide private]
[frames | no frames]

Source Code for Module Bio.PopGen.GenePop.FileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This class provides code to parse BIG GenePop files. 
  8   
  9  The difference between this class and the standard Bio.PopGen.GenePop.Record 
 10  class is that this one does not read the whole file to memory. 
 11  It provides an iterator interface, slower but consuming much less memory. 
 12  Should be used with big files (Thousands of markers and individuals). 
 13   
 14  See http://wbiomed.curtin.edu.au/genepop/ , the format is documented 
 15  here: http://wbiomed.curtin.edu.au/genepop/help_input.html . 
 16   
 17  Classes: 
 18  FileRecord           Holds GenePop data. 
 19   
 20  Functions: 
 21  read                 Parses a GenePop file, returning a FileRecord. 
 22   
 23  """ 
 24  from Bio.PopGen.GenePop import get_indiv 
 25   
 26  __docformat__ = "restructuredtext en" 
 27   
def read(fname):
    """Opens a GenePop file for incremental parsing.

    fname is the name of a file that contains a GenePop record.

    Returns a FileRecord built from that file name.
    """
    return FileRecord(fname)
35 36
class FileRecord(object):
    """Holds information from a GenePop record, read lazily from a file.

    Members:

        - comment_line   Comment line (first line of the file).

        - loci_list      List of loci names.

        - fname          Name of the file being parsed.

        - current_pop    Position (0 is first) of the population being read.

        - current_ind    Number of individuals already read from the
          current population.

    Functions:

        - get_individual   Returns the next individual of the current
          population.

        - skip_population  Skips the current population.

        - close            Closes the underlying file.

    skip_population skips the individuals of the current population, returns
    True if there are more populations and False otherwise.

    get_individual returns an individual of the current population, True
    when the current population ended but another one follows, or False at
    the end of the file.
    Each individual is a pair composed by individual
    name and a list of alleles (2 per marker or 1 for haploid data).
    Examples::

        ('Ind1', [(1,2), (3,3), (200,201)])
        ('Ind2', [(2,None), (3,3), (None,None)])
        ('Other1', [(1,1), (4,3), (200,200)])

    Text produced by this class (__str__ and the remove_* methods) always
    uses 3 digits per allele, whatever the input file used.
    """

    # Number of digits used per allele whenever a record is written out.
    _marker_len = 3

    def __init__(self, fname):
        """Stores the file name and reads the header of the file."""
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        self.start_read()

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.

        The whole file is re-read from the start; the reading position
        held before the call is restored afterwards.
        """
        rep = [self.comment_line + '\n']
        rep.append('\n'.join(self.loci_list) + '\n')
        # Remember where we were, the scan below moves the file position.
        current_pop = self.current_pop
        current_ind = self.current_ind
        self._handle.seek(0)
        self.skip_header()
        rep.append('Pop\n')
        while True:
            res = self.get_individual()
            if res is False:
                break
            if res is True:
                rep.append('Pop\n')
            else:
                name, markers = res
                rep.append(self._format_individual(name, markers))
        self.seek_position(current_pop, current_ind)
        return "".join(rep)

    @staticmethod
    def _format_individual(name, markers, skipped=(), marker_len=3):
        """Returns the GenePop text line (newline included) for an individual.

        name       - individual name
        markers    - list of allele tuples, None standing for missing data
        skipped    - container of marker positions to leave out
        marker_len - digits per allele, shorter values are left-padded
                     with zeros
        """
        parts = [name, ","]
        for position, marker in enumerate(markers):
            if position in skipped:
                continue
            parts.append(' ')
            for al in marker:
                if al is None:
                    al = '0'
                # rjust prepends zeros exactly like the former manual loop.
                parts.append(str(al).rjust(marker_len, '0'))
        parts.append('\n')
        return "".join(parts)

    def close(self):
        """Closes the underlying file. Can be called more than once."""
        handle = getattr(self, "_handle", None)
        if handle is not None:
            handle.close()

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

        Opens the file, reads the comment line and the loci names and
        leaves the file positioned right after the first POP line.

        Raises ValueError if no POP line is found; the file is closed
        before any error is propagated.
        """
        self._handle = open(self.fname)
        try:
            self.comment_line = self._handle.readline().rstrip()
            # We can now have one loci per line or all loci in a single line
            # separated by either space or comma+space...
            # We will remove all commas on loci... that should not be a
            # problem
            sample_loci_line = self._handle.readline().rstrip()
            sample_loci_line = sample_loci_line.replace(',', '')
            self.loci_list.extend(sample_loci_line.split(' '))
            for line in self._handle:
                line = line.rstrip()
                if line.upper() == 'POP':
                    break
                self.loci_list.append(line)
            else:
                raise ValueError('No population data found, file probably not GenePop related')
        except Exception:
            # Do not leak the handle when the header cannot be parsed.
            self._handle.close()
            raise
        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open."""
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

        pop   - pop position (0 is first)
        indiv - individual in pop
        """
        self._handle.seek(0)
        self.skip_header()
        while pop > 0:
            self.skip_population()
            pop -= 1
        while indiv > 0:
            self.get_individual()
            indiv -= 1

    def skip_population(self):
        """Skips the current population.

        Returns True if there is another pop, False if the end of the
        file was reached.
        """
        for line in self._handle:
            if line.rstrip().upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
        return False

    def get_individual(self):
        """Gets the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.
        """
        # The loop body always returns: it is only used to fetch one line.
        for line in self._handle:
            line = line.rstrip()
            if line.upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
            self.current_ind += 1
            indiv_name, allele_list, ignore = get_indiv(line)
            return (indiv_name, allele_list)
        return False

    def remove_population(self, pos, fname):
        """Removes a population (by position).

        pos   - position
        fname - file to be created with population removed

        The source file is re-read from the start, the reading position
        of this record is not affected. A POP line is only written in
        front of a population that has at least one individual.
        """
        old_rec = read(self.fname)
        try:
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                start_pop = True
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        # Population boundary. Checked first so that an
                        # empty population is never mistaken for data to
                        # be skipped.
                        curr_pop += 1
                        start_pop = True
                    elif curr_pop == pos:
                        # First individual of the unwanted population:
                        # drop it together with the rest of its pop.
                        if not old_rec.skip_population():
                            break
                        curr_pop += 1
                        start_pop = True
                    else:
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def _write_without_loci(self, positions, fname):
        """Writes to fname a copy of the source file lacking some loci.

        positions - iterable of loci positions, negative values count
                    from the end
        fname     - file to be created

        Raises IndexError, before fname is created, if a position does
        not exist.
        """
        old_rec = read(self.fname)
        try:
            loci_list = old_rec.loci_list
            n_loci = len(loci_list)
            removed = set()
            for pos in positions:
                if not -n_loci <= pos < n_loci:
                    raise IndexError("locus position out of range: %r"
                                     % (pos,))
                # Normalise so that names and marker columns agree.
                removed.add(pos % n_loci)
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for position, locus in enumerate(loci_list):
                    if position not in removed:
                        f.write(locus + "\n")
                f.write("POP\n")
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        f.write("POP\n")
                    else:
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers,
                                                        removed))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_position(self, pos, fname):
        """Removes a locus by position.

        pos   - position
        fname - file to be created with locus removed

        Raises IndexError if there is no locus at that position.
        """
        self._write_without_loci([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Removes a set of loci by position.

        positions - positions (left untouched, repeated values count once)
        fname     - file to be created with locus removed

        Raises IndexError if one of the positions does not exist.
        """
        self._write_without_loci(positions, fname)

    def remove_locus_by_name(self, name, fname):
        """Removes a locus by name.

        name  - name
        fname - file to be created with locus removed

        If no locus has that name nothing is done and no file is created.
        """
        try:
            pos = self.loci_list.index(name)
        except ValueError:
            # Locus not existent... Maybe raise exception?
            # Although it should be Ok... Just a boolean return, maybe?
            return
        self.remove_locus_by_position(pos, fname)

    def remove_loci_by_name(self, names, fname):
        """Removes a loci list (by name).

        names - names
        fname - file to be created with loci removed

        Names that match no locus are ignored; the file is created even
        if nothing matches.
        """
        positions = [position
                     for position, locus in enumerate(self.loci_list)
                     if locus in names]
        self.remove_loci_by_position(positions, fname)