Package Bio :: Package Affy :: Module CelFile
[hide private]
[frames] | no frames]

Source Code for Module Bio.Affy.CelFile

  1  # Copyright 2004 by Harry Zuzan. All rights reserved. 
  2  # Copyright 2016 by Adam Kurkiewicz. All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """ 
  8  Reading information from Affymetrix CEL files version 3 and 4. 
  9  """ 
 10   
 11  from __future__ import print_function 
 12  import sys 
 13  import struct 
 14   
 15  try: 
 16      import numpy 
 17  except ImportError: 
 18      from Bio import MissingPythonDependencyError 
 19      raise MissingPythonDependencyError( 
 20          "Install NumPy if you want to use Bio.Affy.CelFile") 
 21   
 22   
23 -class ParserError(ValueError):
24 - def __init__(self, *args):
25 super(ParserError, self).__init__(*args)
26 27 _modeError = ParserError("You're trying to open an Affymetrix v4" 28 " CEL file. You have to use a read binary mode," 29 " like this `open(filename \"rb\")`.") 30 31 # for debugging 32 # import pprint 33 # pp = pprint.PrettyPrinter(indent=4) 34 35
36 -class Record(object):
37 """Stores the information in a cel file 38 39 Example usage: 40 41 >>> from Bio.Affy import CelFile 42 >>> with open("Affy/affy_v3_example.CEL", "r") as handle: 43 ... c = CelFile.read(handle) 44 ... 45 >>> print(c.ncols, c.nrows) 46 5 5 47 >>> print(c.intensities) 48 [[ 234. 170. 22177. 164. 22104.] 49 [ 188. 188. 21871. 168. 21883.] 50 [ 188. 193. 21455. 198. 21300.] 51 [ 188. 182. 21438. 188. 20945.] 52 [ 193. 20370. 174. 20605. 168.]] 53 >>> print(c.stdevs) 54 [[ 24. 34.5 2669. 19.7 3661.2] 55 [ 29.8 29.8 2795.9 67.9 2792.4] 56 [ 29.8 88.7 2976.5 62. 2914.5] 57 [ 29.8 76.2 2759.5 49.2 2762. ] 58 [ 38.8 2611.8 26.6 2810.7 24.1]] 59 >>> print(c.npix) 60 [[25 25 25 25 25] 61 [25 25 25 25 25] 62 [25 25 25 25 25] 63 [25 25 25 25 25] 64 [25 25 25 25 25]] 65 66 """
67 - def __init__(self):
68 self.version = None 69 self.GridCornerUL = None 70 self.GridCornerUR = None 71 self.GridCornerLR = None 72 self.GridCornerLL = None 73 self.DatHeader = None 74 self.Algorithm = None 75 self.AlgorithmParameters = None 76 self.NumberCells = None 77 self.intensities = None 78 self.stdevs = None 79 self.npix = None 80 self.nrows = None 81 self.ncols = None 82 self.nmask = None 83 self.mask = None 84 self.noutliers = None 85 self.outliers = None 86 self.modified = None
87 88
89 -def read(handle):
90 """ Reads Affymetrix CEL file and returns Record object. 91 92 CEL files version 3 and 4 are supported, and the parser attempts version detection. 93 94 Example Usage: 95 96 >>> from Bio.Affy import CelFile 97 >>> with open("Affy/affy_v4_example.CEL", "rb") as handle: 98 ... c = CelFile.read(handle) 99 ... 100 >>> c.version == 4 101 True 102 """ 103 # If we fail to read the magic number, then it will remain None, and thus 104 # we will invoke read_v3 (if mode is not strict), or raise IOError if mode 105 # is strict. 106 magicNumber = None 107 # We check if the handle is a file-like object. If it isn't, and the mode 108 # is strict, we raise an error. If it isn't and the mode isn't strict, we 109 # continue (perhaps somebody has got a CEL-file-like iterable, which used 110 # to work with previous versions of biopython and we don't want to maintain 111 # backwards compatibility). 112 try: 113 mode = handle.mode 114 # By definition an Affymetrix v4 CEL file has 64 as the first 4 bytes. 115 # Note that we use little-endian irrespective of the platform, again by 116 # definition. 117 position = handle.tell() 118 magicNumber = struct.unpack("<i", handle.read(4))[0] 119 except (AttributeError, TypeError): 120 pass 121 except UnicodeDecodeError: 122 raise _modeError 123 finally: 124 try: 125 # reset the offset, to avoid breaking either v3 or v4. 126 handle.seek(position) 127 except AttributeError: 128 pass 129 if magicNumber != 64: 130 return read_v3(handle) 131 132 else: 133 return read_v4(handle)
134 135 136 # read Affymetrix files version 4.
137 -def read_v4(f):
138 """ Reads Affymetrix CEL file, version 4, and returns a corresponding Record 139 object. 140 141 Most importantly record.intensities correspond to intensities from the CEL 142 file. 143 144 record.mask and record.outliers are not set. 145 146 Example Usage: 147 148 >>> from Bio.Affy import CelFile 149 >>> with open("Affy/affy_v4_example.CEL", "rb") as handle: 150 ... c = CelFile.read_v4(handle) 151 ... 152 >>> c.version == 4 153 True 154 >>> print(c.intensities.shape) 155 (5, 5) 156 """ 157 # We follow the documentation here: 158 # http://www.affymetrix.com/estore/support/developer/powertools/changelog/gcos-agcc/cel.html.affx 159 record = Record() 160 preHeaders = ["magic", "version", "columns", "rows", "cellNo", "headerLen"] 161 preHeadersMap = dict() 162 headersMap = dict() 163 164 # load pre-headers 165 try: 166 for name in preHeaders: 167 preHeadersMap[name] = struct.unpack("<i", f.read(4))[0] 168 except UnicodeDecodeError as e: 169 raise _modeError 170 171 char = f.read(preHeadersMap["headerLen"]) 172 header = char.decode("ascii", "ignore") 173 for header in header.split("\n"): 174 if "=" in header: 175 header = header.split("=") 176 headersMap[header[0]] = "=".join(header[1:]) 177 178 # for debugging 179 # pp.pprint("preHeadersMap") 180 # pp.pprint(preHeadersMap) 181 # pp.pprint("headersMap") 182 # pp.pprint(headersMap) 183 184 record.version = preHeadersMap["version"] 185 if record.version != 4: 186 raise ParserError("You are trying to parse CEL file version 4. This" 187 " file violates the structure expected from CEL file" 188 " version 4") 189 record.GridCornerUL = headersMap["GridCornerUL"] 190 record.GridCornerUR = headersMap["GridCornerUR"] 191 record.GridCornerLR = headersMap["GridCornerLR"] 192 record.GridCornerLL = headersMap["GridCornerLL"] 193 record.DatHeader = headersMap["DatHeader"] 194 record.Algorithm = headersMap["Algorithm"] 195 record.AlgorithmParameters = headersMap["AlgorithmParameters"] 196 record.NumberCells = preHeadersMap["cellNo"] 197 # record.intensities are set below 198 # record.stdevs are set below 199 # record.npix are set below 200 record.nrows = int(headersMap["Rows"]) 201 record.ncols = int(headersMap["Cols"]) 202 203 # These cannot be reliably set in v4, because of discrepancies between real 204 # data and the documented format. 205 record.nmask = None 206 record.mask = None 207 record.noutliers = None 208 record.outliers = None 209 record.modified = None 210 211 # Real data never seems to have anything but zeros here, but we don't want 212 # to take chances. Raising an error is better than returning unreliable 213 # data. 214 def raiseBadHeader(field, expected): 215 actual = int(headersMap[field]) 216 message = "The header {field} is expected to be 0, not {value}".format(value=actual, field=field) 217 if actual != expected: 218 raise ParserError(message)
219 220 raiseBadHeader("Axis-invertX", 0) 221 222 raiseBadHeader("AxisInvertY", 0) 223 224 raiseBadHeader("OffsetX", 0) 225 226 raiseBadHeader("OffsetY", 0) 227 228 # This is unfortunately undocumented, but it turns out that real data has 229 # the `record.AlgorithmParameters` repeated in the data section, until an 230 # EOF, i.e. b"\x04". 231 char = b"\x00" 232 safetyValve = 10**4 233 for i in range(safetyValve): 234 char = f.read(1) 235 # For debugging 236 # print([i for i in char], end="") 237 if char == b"\x04": 238 break 239 if i == safetyValve: 240 raise ParserError("Parse Error. The parser expects a short, " 241 "undocumented binary blob terminating with " 242 "ASCII EOF, x04") 243 244 # After that there are precisely 15 bytes padded. Again, undocumented. 245 padding = f.read(15) 246 247 # That's how we pull out the values (triplets of the form float, float, 248 # signed short). 249 structa = struct.Struct("< f f h") 250 251 # There are 10 bytes in our struct. 252 structSize = 10 253 254 # We initialise the most important: intensities, stdevs and npixs. 255 record.intensities = numpy.empty(record.NumberCells, dtype=float) 256 record.stdevs = numpy.empty(record.NumberCells, dtype=float) 257 record.npix = numpy.empty(record.NumberCells, dtype=int) 258 259 b = f.read(structSize * record.NumberCells) 260 for i in range(record.NumberCells): 261 binaryFragment = b[i * structSize: (i + 1) * structSize] 262 intensity, stdevs, npix = structa.unpack(binaryFragment) 263 record.intensities[i] = intensity 264 record.stdevs[i] = stdevs 265 record.npix[i] = npix 266 267 # reshape without copying. 268 def reshape(array): 269 view = array.view() 270 view.shape = (record.nrows, record.ncols) 271 return view 272 273 record.intensities = reshape(record.intensities) 274 record.stdevs = reshape(record.stdevs) 275 record.npix = reshape(record.npix) 276 277 return record 278 279
280 -def read_v3(handle):
281 """ Reads Affymetrix CEL file, version 3, and returns a corresponding Record object. 282 283 Example Usage: 284 285 >>> from Bio.Affy import CelFile 286 >>> with open("Affy/affy_v3_example.CEL", "r") as handle: 287 ... c = CelFile.read_v3(handle) 288 ... 289 >>> c.version == 3 290 True 291 """ 292 # Needs error handling. 293 # Needs to know the chip design. 294 record = Record() 295 section = "" 296 for line in handle: 297 if not line.strip(): 298 continue 299 # Set current section 300 if line[:5] == "[CEL]": 301 section = "CEL" 302 elif line[:8] == "[HEADER]": 303 section = "HEADER" 304 elif line[:11] == "[INTENSITY]": 305 section = "INTENSITY" 306 record.intensities = numpy.zeros((record.nrows, record.ncols)) 307 record.stdevs = numpy.zeros((record.nrows, record.ncols)) 308 record.npix = numpy.zeros((record.nrows, record.ncols), int) 309 elif line[:7] == "[MASKS]": 310 section = "MASKS" 311 record.mask = numpy.zeros((record.nrows, record.ncols)) 312 elif line[:10] == "[OUTLIERS]": 313 section = "OUTLIERS" 314 record.outliers = numpy.zeros((record.nrows, record.ncols)) 315 elif line[:10] == "[MODIFIED]": 316 section = "MODIFIED" 317 record.modified = numpy.zeros((record.nrows, record.ncols)) 318 elif line[0] == "[": 319 # This would be an unknown section 320 section = "" 321 elif section == "CEL": 322 keyword, value = line.split("=", 1) 323 if keyword == "Version": 324 record.version = int(value) 325 elif section == "HEADER": 326 # Set record.ncols and record.nrows, remaining data goes into 327 # record.header dict 328 keyword, value = line.split("=", 1) 329 if keyword == "Cols": 330 record.ncols = int(value) 331 elif keyword == "Rows": 332 record.nrows = int(value) 333 elif keyword == "GridCornerUL": 334 x, y = value.split() 335 record.GridCornerUL = (int(x), int(y)) 336 elif keyword == "GridCornerUR": 337 x, y = value.split() 338 record.GridCornerUR = (int(x), int(y)) 339 elif keyword == "GridCornerLR": 340 x, y = value.split() 341 record.GridCornerLR = (int(x), int(y)) 342 elif keyword == "GridCornerLL": 343 x, y = value.split() 344 record.GridCornerLL = (int(x), int(y)) 345 elif keyword == "DatHeader": 346 record.DatHeader = value.strip("\n\r") 347 elif keyword == "Algorithm": 348 record.Algorithm = value.strip("\n\r") 349 elif keyword == "AlgorithmParameters": 350 record.AlgorithmParameters = value.strip("\n\r") 351 elif section == "INTENSITY": 352 if "NumberCells" in line: 353 record.NumberCells = int(line.split("=", 1)[1]) 354 elif "CellHeader" in line: 355 pass 356 else: 357 words = line.split() 358 y = int(words[0]) 359 x = int(words[1]) 360 record.intensities[x, y] = float(words[2]) 361 record.stdevs[x, y] = float(words[3]) 362 record.npix[x, y] = int(words[4]) 363 elif section == "MASKS": 364 if "NumberCells" in line: 365 record.nmask = int(line.split("=", 1)[1]) 366 elif "CellHeader" in line: 367 pass 368 else: 369 words = line.split() 370 y = int(words[0]) 371 x = int(words[1]) 372 record.mask[x, y] = int(1) 373 elif section == "OUTLIERS": 374 if "NumberCells" in line: 375 record.noutliers = int(line.split("=", 1)[1]) 376 elif "CellHeader" in line: 377 pass 378 else: 379 words = line.split() 380 y = int(words[0]) 381 x = int(words[1]) 382 record.outliers[x, y] = int(1) 383 elif section == "MODIFIED": 384 if "NumberCells" in line: 385 record.nmodified = int(line.split("=", 1)[1]) 386 elif "CellHeader" in line: 387 pass 388 else: 389 words = line.split() 390 y = int(words[0]) 391 x = int(words[1]) 392 record.modified[x, y] = float(words[2]) 393 else: 394 continue 395 return record
396 397 if __name__ == "__main__": 398 from Bio._utils import run_doctest 399 run_doctest() 400