Package Bio :: Package PopGen :: Package GenePop :: Module FileParser
[hide private]
[frames | no frames]

Source Code for Module Bio.PopGen.GenePop.FileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Code to parse BIG GenePop files. 
  7   
  8  The difference between this class and the standard Bio.PopGen.GenePop.Record 
  9  class is that this one does not read the whole file to memory. 
 10  It provides an iterator interface, slower but consuming much less memory. 
 11  Should be used with big files (Thousands of markers and individuals). 
 12   
 13  See http://wbiomed.curtin.edu.au/genepop/ , the format is documented 
 14  here: http://wbiomed.curtin.edu.au/genepop/help_input.html . 
 15   
 16  Classes: 
 17   - FileRecord           Holds GenePop data. 
 18   
 19  Functions: 
 20   - read                 Parses a GenePop file, returning a FileRecord. 
 21   
 22  """ 
 23  from Bio.PopGen.GenePop import get_indiv 
 24   
 25   
def read(fname):
    """Open a GenePop file for incremental parsing.

    Arguments:
     - fname - name of a file that contains a GenePop record

    Returns a FileRecord built from that file name.
    """
    return FileRecord(fname)
33 34
class FileRecord(object):
    """Holds information from a GenePop record.

    The file is read incrementally through an open handle rather than being
    loaded into memory, so the record keeps a position (current population
    and current individual) inside the file.

    Attributes:
     - marker_len         The marker length (2 or 3 digit code per allele).
     - comment_line       Comment line.
     - loci_list          List of loci names.
     - fname              Name of the file being parsed.
     - current_pop        Index of the population being read (0 is first).
     - current_ind        Number of individuals already read in that pop.

    Methods:
     - get_individual     Returns the next individual of the current population.
     - skip_population    Skips the current population.
     - close              Closes the underlying file handle.

    skip_population skips the individuals of the current population, returns
    True if there are more populations.

    get_individual returns an individual of the current population, True
    when the population ended but another one follows, or False at the end
    of the file.

    Each individual is a pair composed by individual name and a list of alleles
    (2 per marker or 1 for haploid data). Examples::

        ('Ind1', [(1,2),    (3,3), (200,201)])
        ('Ind2', [(2,None), (3,3), (None,None)])
        ('Other1', [(1,1),  (4,3), (200,200)])

    The record can be used as a context manager, in which case the file
    handle is closed on exit.
    """

    def __init__(self, fname):
        """Store the file name and parse the header of the file."""
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        self.start_read()

    def __enter__(self):
        """Return the record itself so it can be used in a with statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file handle when leaving a with statement."""
        self.close()

    def close(self):
        """Close the underlying file handle.

        Safe to call more than once, and also when the handle was never
        opened.
        """
        handle = getattr(self, "_handle", None)
        if handle is not None:
            handle.close()

    @staticmethod
    def _format_individual(name, markers, marker_len=3, skip=()):
        """Return the GenePop text line (newline included) of one individual.

        Arguments:
         - name - individual name
         - markers - list of allele tuples, one tuple per marker
         - marker_len - number of digits used for each allele
         - skip - container of marker positions to leave out

        Missing alleles (None) are written as zero and every allele is
        left-padded with zeros up to marker_len digits.
        """
        parts = [name, ","]
        for marker_pos, marker in enumerate(markers):
            if marker_pos in skip:
                continue
            parts.append(" ")
            for allele in marker:
                text = "0" if allele is None else str(allele)
                parts.append(text.rjust(marker_len, "0"))
        parts.append("\n")
        return "".join(parts)

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.

        The whole file is re-read from the start; the reading position held
        before the call is restored afterwards, even if parsing fails.
        """
        marker_len = 3
        rep = [self.comment_line + "\n"]
        rep.append("\n".join(self.loci_list) + "\n")
        current_pop = self.current_pop
        current_ind = self.current_ind
        try:
            self._handle.seek(0)
            self.skip_header()
            rep.append("Pop\n")
            while True:
                res = self.get_individual()
                if res is False:
                    break
                if res is True:
                    rep.append("Pop\n")
                else:
                    name, markers = res
                    rep.append(
                        self._format_individual(name, markers, marker_len)
                    )
        finally:
            self.seek_position(current_pop, current_ind)
        return "".join(rep)

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

        Opens the file, reads the comment line and the loci names, and stops
        right after the first population marker.

        Raises ValueError if no population marker is found. The handle is
        closed again whenever the header cannot be read.
        """
        self._handle = open(self.fname)
        try:
            self.comment_line = self._handle.readline().rstrip()
            # We can now have one loci per line or all loci in a single line
            # separated by either space or comma+space...
            # We will remove all commas on loci... that should not be a problem
            sample_loci_line = self._handle.readline().rstrip().replace(",", "")
            # Repeated spaces would otherwise produce empty loci names and
            # shift the position of every following locus.
            self.loci_list.extend(
                locus for locus in sample_loci_line.split(" ") if locus
            )
            for line in self._handle:
                line = line.rstrip()
                if line.upper() == "POP":
                    break
                if line:
                    self.loci_list.append(line)
            else:
                raise ValueError(
                    "No population data found, file probably not GenePop related"
                )
        except Exception:
            # Do not leak the handle when the header is unreadable.
            self._handle.close()
            raise
        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open."""
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

        Arguments:
         - pop - pop position (0 is first)
         - indiv - individual in pop

        The file is rewound and read forward again, so the cost grows with
        the distance from the start of the file.
        """
        self._handle.seek(0)
        self.skip_header()
        while pop > 0:
            self.skip_population()
            pop -= 1
        while indiv > 0:
            self.get_individual()
            indiv -= 1

    def skip_population(self):
        """Skips the current population.

        Returns True if there is another pop, False if the end of the file
        was reached.
        """
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                self.current_pop += 1
                self.current_ind = 0
                return True
        # Iterating a file never yields an empty string, so the end of the
        # file is only visible as the loop running out of lines.
        return False

    def get_individual(self):
        """Gets the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.

        Blank lines are skipped and do not count as individuals.
        """
        for line in self._handle:
            line = line.rstrip()
            if not line:
                # e.g. an empty line at the end of the file
                continue
            if line.upper() == "POP":
                self.current_pop += 1
                self.current_ind = 0
                return True
            self.current_ind += 1
            # The third value returned by get_indiv is not needed here.
            indiv_name, allele_list, _ = get_indiv(line)
            return indiv_name, allele_list
        return False

    def remove_population(self, pos, fname):
        """Removes a population (by position).

        Arguments:
         - pos - position
         - fname - file to be created with population removed

        The source file is re-read through a separate record, so the reading
        position of this record is left untouched.
        """
        old_rec = read(self.fname)
        try:
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                # The POP marker is only written once a population is known
                # to have an individual that is kept.
                start_pop = True
                res = old_rec.get_individual()
                while res:
                    if res is True:
                        # Checked first so that an empty population at pos
                        # does not swallow the population that follows it.
                        curr_pop += 1
                        start_pop = True
                    elif curr_pop == pos:
                        old_rec.skip_population()
                        curr_pop += 1
                    else:
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = res
                        f.write(self._format_individual(name, markers))
                    res = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_position(self, pos, fname):
        """Removes a locus by position.

        Arguments:
         - pos - position
         - fname - file to be created with locus removed

        Raises IndexError if pos is not a valid locus position.
        """
        self.remove_loci_by_position([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Removes a set of loci by position.

        Arguments:
         - positions - positions
         - fname - file to be created with locus removed

        positions is not modified and may contain duplicates. Negative
        positions count from the end of the loci list.

        Raises IndexError, before the output file is created, if a position
        is not a valid locus position.
        """
        old_rec = read(self.fname)
        try:
            loci_list = old_rec.loci_list
            n_loci = len(loci_list)
            drop = set()
            for pos in positions:
                if not -n_loci <= pos < n_loci:
                    raise IndexError(
                        "locus position out of range: %r" % (pos,)
                    )
                # Normalise so the same index selects both the locus name
                # and the marker column.
                drop.add(pos % n_loci)
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus_pos, locus in enumerate(loci_list):
                    if locus_pos not in drop:
                        f.write(locus + "\n")
                f.write("POP\n")
                res = old_rec.get_individual()
                while res:
                    if res is True:
                        f.write("POP\n")
                    else:
                        name, markers = res
                        f.write(
                            self._format_individual(name, markers, skip=drop)
                        )
                    res = old_rec.get_individual()
        finally:
            old_rec.close()

    def remove_locus_by_name(self, name, fname):
        """Removes a locus by name.

        Arguments:
         - name - name
         - fname - file to be created with locus removed

        Only the first locus with that name is removed.
        """
        try:
            pos = self.loci_list.index(name)
        except ValueError:
            # If here then locus not existent... Maybe raise exception?
            # Although it should be Ok... Just a boolean return, maybe?
            # For now nothing is written in that case.
            return
        self.remove_locus_by_position(pos, fname)

    def remove_loci_by_name(self, names, fname):
        """Removes a loci list (by name).

        Arguments:
         - names - names
         - fname - file to be created with loci removed

        Names that are not in the loci list are ignored; the output file is
        written even if no locus matches.
        """
        positions = [
            pos for pos, locus in enumerate(self.loci_list) if locus in names
        ]
        self.remove_loci_by_position(positions, fname)