Package Bio :: Package PopGen :: Package GenePop :: Module FileParser
[hide private]
[frames | no frames]

Source Code for Module Bio.PopGen.GenePop.FileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This class provides code to parse BIG GenePop files. 
  8   
  9  The difference between this class and the standard Bio.PopGen.GenePop.Record 
 10  class is that this one does not read the whole file to memory. 
 11  It provides an iterator interface, slower but consuming much less memory. 
 12  Should be used with big files (Thousands of markers and individuals). 
 13   
 14  See http://wbiomed.curtin.edu.au/genepop/ , the format is documented 
 15  here: http://wbiomed.curtin.edu.au/genepop/help_input.html . 
 16   
 17  Classes: 
 18  FileRecord           Holds GenePop data. 
 19   
 20  Functions: 
 21  read                 Returns a FileRecord for the named GenePop file. 
 22   
 23  """ 
 24  from Bio.PopGen.GenePop import get_indiv 
 25   
 26   
def read(fname):
    """Open a GenePop file for incremental parsing.

    fname is the name of a file that contains a GenePop record.

    Returns a FileRecord built from that file name.
    """
    return FileRecord(fname)
34 35
class FileRecord(object):
    """Holds information from a GenePop record, read lazily from a file.

    Only the header (comment line and loci names) is kept in memory;
    individuals are read from the file one at a time on request.

    Members:
    comment_line       Comment line (first line of the file).

    loci_list          List of loci names.

    fname              Name of the file being parsed.

    current_pop        Index of the population being read (0 is first).

    current_ind        Number of individuals already read in that population.

    Note: no marker length attribute is stored; every method that writes
    GenePop text uses 3-digit allele codes.

    Functions:
    get_individual     Returns the next individual of the current population.

    skip_population    Skips the current population.

    skip_population skips the individuals of the current population, returns
    True if there are more populations and False at the end of the file.

    get_individual returns an individual of the current population, True
    when the current population ended and another one follows, or False at
    the end of the file.
    Each individual is a pair composed by individual
    name and a list of alleles (2 per marker or 1 for haploid data).
    Examples
        ('Ind1', [(1,2),    (3,3), (200,201)])
        ('Ind2', [(2,None), (3,3), (None,None)])
        ('Other1', [(1,1),  (4,3), (200,200)])
    """

    def __init__(self, fname):
        """Store the file name and parse the header of the file.

        fname - name of the GenePop file to open.

        Raises ValueError (from start_read) if no population marker is found.
        """
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        self.start_read()

    def __del__(self):
        """Close the underlying file handle, if one was ever opened."""
        self._close()

    def _close(self):
        """Close the file handle; safe to call when it was never opened."""
        handle = getattr(self, "_handle", None)
        if handle is not None:
            handle.close()

    @staticmethod
    def _format_individual(name, markers, skip=(), marker_len=3):
        """Return one GenePop text line (newline included) for an individual.

        name       - individual name
        markers    - list of allele tuples, one tuple per marker
        skip       - container of marker positions to leave out
        marker_len - width every allele code is zero-padded to

        Missing alleles (None) are written as zeros.
        """
        parts = [name, ","]
        for position, marker in enumerate(markers):
            if position in skip:
                continue
            parts.append(" ")
            for allele in marker:
                code = "0" if allele is None else str(allele)
                parts.append(code.rjust(marker_len, "0"))
        parts.append("\n")
        return "".join(parts)

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.

        The whole file is re-read from the start; afterwards the read
        position (population and individual) is restored.
        """
        rep = [self.comment_line + "\n"]
        rep.append("\n".join(self.loci_list) + "\n")
        # Remember where we were: skip_header() below resets both counters.
        current_pop = self.current_pop
        current_ind = self.current_ind
        self._handle.seek(0)
        self.skip_header()
        rep.append("Pop\n")
        while True:
            res = self.get_individual()
            if res is True:
                rep.append("Pop\n")
            elif res is False:
                break
            else:
                name, markers = res
                rep.append(self._format_individual(name, markers))
        self.seek_position(current_pop, current_ind)
        return "".join(rep)

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

        Opens the file, reads the comment line and the loci names and stops
        right after the first population marker.

        Raises ValueError if the file holds no population marker.
        """
        self._handle = open(self.fname)
        self.comment_line = self._handle.readline().rstrip()
        # We can now have one loci per line or all loci in a single line
        # separated by either space or comma+space...
        # We will remove all commas on loci... that should not be a problem
        # NOTE: names separated by a comma with no space end up merged.
        sample_loci_line = self._handle.readline().rstrip().replace(",", "")
        self.loci_list.extend(sample_loci_line.split(" "))
        for line in self._handle:
            line = line.rstrip()
            if line.upper() == "POP":
                break
            self.loci_list.append(line)
        else:
            # Do not leave the handle open when the file is rejected.
            self._handle.close()
            raise ValueError('No population data found, file probably not GenePop related')
        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open.

        Resets the population and individual counters and consumes lines up
        to and including the first population marker.
        """
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

        pop - pop position (0 is first)
        indiv - individual in pop

        The file is rewound and re-read up to the requested position.
        """
        self._handle.seek(0)
        self.skip_header()
        while pop > 0:
            self.skip_population()
            pop -= 1
        while indiv > 0:
            self.get_individual()
            indiv -= 1

    def skip_population(self):
        """Skips the current population.

        Returns True if there is another population, False if the end of
        the file was reached.
        """
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                self.current_pop += 1
                self.current_ind = 0
                return True
        # End of file: say so explicitly instead of returning None.
        return False

    def get_individual(self):
        """Gets the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.
        """
        line = next(self._handle, None)
        if line is None:
            return False
        line = line.rstrip()
        if line.upper() == "POP":
            self.current_pop += 1
            self.current_ind = 0
            return True
        self.current_ind += 1
        # get_indiv returns three values; the third one is not needed here.
        indiv_name, allele_list, ignore = get_indiv(line)
        return (indiv_name, allele_list)

    def remove_population(self, pos, fname):
        """Removes a population (by position).

        pos - position
        fname - file to be created with population removed

        The source file is re-read through a separate record, so the read
        position of this object is left untouched.
        """
        old_rec = FileRecord(self.fname)
        try:
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for locus in old_rec.loci_list:
                    f.write(locus + "\n")
                curr_pop = 0
                # "POP" is written lazily, just before the first individual
                # kept in each population.
                start_pop = True
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        # Population boundary. Checked first so that an
                        # empty population at pos does not cause the
                        # following population to be skipped instead.
                        curr_pop += 1
                        start_pop = True
                    elif curr_pop == pos:
                        # The first individual of the unwanted population
                        # was just read; discard the rest of it.
                        old_rec.skip_population()
                        curr_pop += 1
                    else:
                        if start_pop:
                            f.write("POP\n")
                            start_pop = False
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec._close()

    def _write_without_loci(self, positions, fname):
        """Write a copy of the source file leaving out the given loci.

        positions - iterable of locus positions (negative values count from
                    the end, as in list indexing)
        fname     - file to be created

        Raises IndexError, before the output file is touched, if a position
        is out of range.
        """
        old_rec = FileRecord(self.fname)
        try:
            loci_list = old_rec.loci_list
            num_loci = len(loci_list)
            drop = set()
            for pos in positions:
                if not -num_loci <= pos < num_loci:
                    raise IndexError("locus position out of range: %r" % (pos,))
                # Normalise negative positions so that the marker column
                # matching the removed locus name is dropped as well.
                drop.add(pos % num_loci)
            with open(fname, "w") as f:
                f.write(self.comment_line + "\n")
                for index, locus in enumerate(loci_list):
                    if index not in drop:
                        f.write(locus + "\n")
                f.write("POP\n")
                l_parser = old_rec.get_individual()
                while l_parser:
                    if l_parser is True:
                        f.write("POP\n")
                    else:
                        name, markers = l_parser
                        f.write(self._format_individual(name, markers, drop))
                    l_parser = old_rec.get_individual()
        finally:
            old_rec._close()

    def remove_locus_by_position(self, pos, fname):
        """Removes a locus by position.

        pos - position
        fname - file to be created with locus removed

        Raises IndexError if pos is out of range.
        """
        self._write_without_loci([pos], fname)

    def remove_loci_by_position(self, positions, fname):
        """Removes a set of loci by position.

        positions - positions (left unmodified; duplicates are ignored)
        fname - file to be created with locus removed

        Raises IndexError if any position is out of range.
        """
        self._write_without_loci(positions, fname)

    def remove_locus_by_name(self, name, fname):
        """Removes a locus by name.

        name - name
        fname - file to be created with locus removed

        Only the first locus with that name is removed. If no locus has
        that name nothing is written.
        """
        for index, locus in enumerate(self.loci_list):
            if locus == name:
                self.remove_locus_by_position(index, fname)
                return
        # If here then the locus does not exist... Maybe raise exception?
        # Although it should be Ok... Just a boolean return, maybe?

    def remove_loci_by_name(self, names, fname):
        """Removes a loci list (by name).

        names - names
        fname - file to be created with loci removed

        Names that match no locus are ignored; the file is written even if
        nothing matches.
        """
        positions = [index for index, locus in enumerate(self.loci_list)
                     if locus in names]
        self.remove_loci_by_position(positions, fname)