Package Bio :: Package motifs :: Package jaspar
[hide private]
[frames | no frames]

Source Code for Package Bio.motifs.jaspar

  1  # Copyright 2013 by Anthony Mathelier and David Arenillas. All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license. Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """JASPAR2014 module.""" 
  7   
  8  from Bio.Seq import Seq 
  9  from Bio.Alphabet.IUPAC import unambiguous_dna as dna 
 10  import re 
 11  import math 
 12   
 13  from Bio._py3k import range 
 14   
 15  from Bio import motifs 
class Motif(motifs.Motif):
    """A subclass of Bio.motifs.Motif used to represent a JASPAR profile.

    Additional metadata information are stored if available. The metadata
    availability depends on the source of the JASPAR motif (a 'pfm' format
    file, a 'jaspar' format file or a JASPAR database).
    """

    def __init__(self, matrix_id, name, alphabet=dna, instances=None,
                 counts=None, collection=None, tf_class=None, tf_family=None,
                 species=None, tax_group=None, acc=None, data_type=None,
                 medline=None, pazar_id=None, comment=None):
        """Construct a JASPAR Motif instance.

        ``alphabet``, ``instances`` and ``counts`` are forwarded to
        ``Bio.motifs.Motif.__init__``; every other argument is stored
        unchanged as an attribute of the same name.
        """
        motifs.Motif.__init__(self, alphabet, instances, counts)
        self.name = name
        self.matrix_id = matrix_id
        self.collection = collection
        self.tf_class = tf_class
        self.tf_family = tf_family
        # May have multiple so species is a list.
        # The species are actually specified as
        # taxonomy IDs.
        self.species = species
        self.tax_group = tax_group
        self.acc = acc  # May have multiple so acc is a list.
        self.data_type = data_type
        self.medline = medline
        self.pazar_id = pazar_id
        self.comment = comment

    @property
    def base_id(self):
        """Return the JASPAR base matrix ID."""
        (base_id, __) = split_jaspar_id(self.matrix_id)
        return base_id

    @property
    def version(self):
        """Return the JASPAR matrix version."""
        (__, version) = split_jaspar_id(self.matrix_id)
        return version

    def __str__(self):
        """Return a string representation of the JASPAR profile.

        We choose to provide only the filled metadata information: the TF
        name, the matrix ID and the count matrix are always written, every
        other field only when its value is truthy.
        """
        parts = [
            "TF name\t{0}\n".format(self.name),
            "Matrix ID\t{0}\n".format(self.matrix_id),
        ]
        # (label, value) pairs in output order.
        optional_fields = (
            ("Collection", self.collection),
            ("TF class", self.tf_class),
            ("TF family", self.tf_family),
            ("Species", self.species),
            ("Taxonomic group", self.tax_group),
            ("Accession", self.acc),
            ("Data type used", self.data_type),
            ("Medline", self.medline),
            ("PAZAR ID", self.pazar_id),
            ("Comments", self.comment),
        )
        for label, value in optional_fields:
            if not value:
                continue
            if label == "Species":
                # Species is the only field rendered as a comma-separated
                # list; the others are formatted as they are.
                value = ",".join(value)
            parts.append("{0}\t{1}\n".format(label, value))
        parts.append("Matrix:\n{0}\n\n".format(self.counts))
        return "".join(parts)

    def __hash__(self):
        """Return the hash key corresponding to the JASPAR profile.

        :note: We assume the unicity of matrix IDs

        """
        return hash(self.matrix_id)

    def __eq__(self, other):
        """Return True if both profiles carry the same matrix ID.

        Objects without a ``matrix_id`` attribute are not comparable, so
        NotImplemented is returned instead of raising AttributeError.
        """
        if not hasattr(other, "matrix_id"):
            return NotImplemented
        return self.matrix_id == other.matrix_id

    def __ne__(self, other):
        """Return the negation of ``__eq__``.

        Python 2 does not derive ``!=`` from ``__eq__``, so it is spelled
        out here to keep both operators consistent.
        """
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result
112
class Record(list):
    """A list of JASPAR motifs.

    Attributes:
     - version: The JASPAR version used

    """

    def __init__(self):
        """Create an empty record whose version is not yet known."""
        self.version = None

    def __str__(self):
        """Return the string forms of all motifs, separated by newlines."""
        return "\n".join([str(entry) for entry in self])

    def to_dict(self):
        """Return the motifs as a dictionary keyed by their matrix ID."""
        # When several motifs share a matrix ID the last one wins.
        return {entry.matrix_id: entry for entry in self}
134
def read(handle, format):
    """Read motif(s) from a file in one of several different JASPAR formats.

    The format name is matched case-insensitively against "pfm", "sites"
    and "jaspar", and the matching private reader is called on the handle.
    Return the record built by that reader; raise ValueError for any other
    format name.
    """
    fmt = format.lower()
    if fmt == "pfm":
        return _read_pfm(handle)
    if fmt == "sites":
        return _read_sites(handle)
    if fmt == "jaspar":
        return _read_jaspar(handle)
    raise ValueError("Unknown JASPAR format %s" % fmt)
154
def write(motifs, format):
    """Return the representation of motifs in "pfm" or "jaspar" format.

    The format name is matched case-insensitively, as in read().

    - "pfm": only the first motif is written, as four rows of counts in
      A, C, G, T order.
    - "jaspar": every motif is written as a ">matrix_id name" header
      followed by four "LETTER [counts]" rows. A motif without a
      ``matrix_id`` attribute gets ``None`` in its header.

    Counts are formatted with two decimals in a six character wide field.
    Raise ValueError for any other format name.
    """
    letters = "ACGT"
    fmt = format.lower()

    def format_row(counts, letter):
        # One matrix row: the counts of a single letter, space separated.
        return " ".join(["{0:6.2f}".format(value) for value in counts[letter]])

    lines = []
    if fmt == "pfm":
        counts = motifs[0].counts
        for letter in letters:
            lines.append("{0}\n".format(format_row(counts, letter)))
    elif fmt == "jaspar":
        for m in motifs:
            counts = m.counts
            # Plain Bio.motifs.Motif objects carry no matrix ID.
            matrix_id = getattr(m, "matrix_id", None)
            lines.append(">{0} {1}\n".format(matrix_id, m.name))
            for letter in letters:
                lines.append(
                    "{0} [{1}]\n".format(letter, format_row(counts, letter))
                )
    else:
        # Report the name exactly as the caller spelled it.
        raise ValueError("Unknown JASPAR format %s" % format)

    # Finished; glue the lines together
    return "".join(lines)
187
def _read_pfm(handle):
    """Read the motif from a JASPAR .pfm file (PRIVATE).

    The first four non-blank lines are taken as the count rows for A, C, G
    and T, in that order. A row may start with its own letter, which is
    ignored. Return a Record holding the single motif, which has neither a
    matrix ID nor a name.
    """
    alphabet = dna
    letters = "ACGT"

    # Tokenise lazily and drop blank lines: a row without any token cannot
    # carry counts and would otherwise fail on the words[0] lookup below.
    rows = (words for words in (line.split() for line in handle) if words)

    counts = {}
    # letters comes first so that zip() stops after the fourth row without
    # pulling further lines from the handle.
    for letter, words in zip(letters, rows):
        # if there is a letter in the beginning, ignore it
        if words[0] == letter:
            words = words[1:]
        counts[letter] = [float(x) for x in words]

    motif = Motif(matrix_id=None, name=None, alphabet=alphabet, counts=counts)
    motif.mask = "*" * motif.length
    record = Record()
    record.append(motif)

    return record
208
def _read_sites(handle):
    """Read the motif from JASPAR .sites file (PRIVATE).

    The file is read as pairs of lines: a header starting with ">" followed
    by one sequence line. Reading stops at the first line in header
    position that does not start with ">". From each sequence line only the
    characters that are equal to their own upper-case form are kept.

    Return a Record holding the single motif built from all sites. Raise
    ValueError if a header is not followed by a sequence line.
    """
    alphabet = dna
    instances = []

    # Use one explicit iterator so that next() works for any iterable of
    # lines and advances the same position as the for loop.
    lines = iter(handle)
    for header in lines:
        if not header.startswith(">"):
            break
        # header contains ">...."; now read the actual sequence
        sequence_line = next(lines, None)
        if sequence_line is None:
            # Report a truncated file explicitly instead of letting a bare
            # StopIteration escape from the reader.
            raise ValueError(
                "Unexpected end of JASPAR sites file: no sequence after "
                "header %r" % header.strip()
            )
        site = "".join([c for c in sequence_line.strip() if c == c.upper()])
        instances.append(Seq(site, alphabet))

    instances = motifs.Instances(instances, alphabet)
    motif = Motif(
        matrix_id=None, name=None, alphabet=alphabet, instances=instances
    )
    motif.mask = "*" * motif.length
    record = Record()
    record.append(motif)

    return record
237
def _read_jaspar(handle):
    """Read motifs from a JASPAR formatted file (PRIVATE).

    Format is one or more records of the form, e.g.::

      - JASPAR 2010 matrix_only format::

                >MA0001.1 AGL3
                A  [ 0  3 79 40 66 48 65 11 65  0 ]
                C  [94 75  4  3  1  2  5  2  3  3 ]
                G  [ 1  0  3  4  1  0  5  3 28 88 ]
                T  [ 2 19 11 50 29 47 22 81  1  6 ]

      - JASPAR 2010-2014 PFMs format::

                >MA0001.1 AGL3
                0       3       79      40      66      48      65      11      65      0
                94      75      4       3       1       2       5       2       3       3
                1       0       3       4       1       0       5       3       28      88
                2       19      11      50      29      47      22      81      1       6

    A motif is appended to the returned Record each time four count rows
    have been collected. A header without a name uses the identifier as
    the name.
    """
    header_re = re.compile(r"^>\s*(\S+)(\s+(\S+))?")
    bracket_row_re = re.compile(r"\s*([ACGT])\s*\[\s*(.*)\s*\]")
    plain_row_re = re.compile(r"\s*(.+)\s*")
    bases = ['A', 'C', 'G', 'T']

    record = Record()
    matrix_id = None
    tf_name = None
    matrix = {}
    rows_seen = 0

    for raw_line in handle:
        text = raw_line.strip()

        header = header_re.match(text)
        if header:
            matrix_id = header.group(1)
            tf_name = header.group(3) or matrix_id
            continue

        bracket_row = bracket_row_re.match(text)
        if bracket_row:
            # Row names its own letter: "A [ ... ]".
            base = bracket_row.group(1)
            values = bracket_row.group(2)
        else:
            plain_row = plain_row_re.match(text)
            if not plain_row:
                # Empty line: nothing to record.
                continue
            # Bare row of numbers: the letter follows from the row position.
            base = bases[rows_seen]
            values = plain_row.group(1)

        matrix[base] = [float(field) for field in values.split()]
        rows_seen += 1

        if rows_seen == 4:
            # Four rows collected: emit the motif and start a fresh one.
            record.append(Motif(matrix_id, tf_name, alphabet=dna,
                                counts=matrix))
            matrix_id = None
            tf_name = None
            matrix = {}
            rows_seen = 0

    return record
312
def calculate_pseudocounts(motif):
    """Return a dictionary of pseudocounts, one per alphabet letter.

    Each pseudocount is the square root of the average column total of the
    motif's count matrix, multiplied by the normalised background
    frequency of the letter. A uniform background is used when the motif
    has none.
    """
    letters = motif.alphabet.letters
    background = motif.background

    # It is possible to have unequal column sums so use the average
    # number of instances.
    column_totals = (
        sum(float(motif.counts[letter][position]) for letter in letters)
        for position in range(motif.length)
    )
    grand_total = sum(column_totals)
    root_instances = math.sqrt(grand_total / motif.length)

    # Work on a copy so the motif's own background is left untouched.
    if background:
        frequencies = dict(background)
    else:
        frequencies = dict.fromkeys(sorted(letters), 1.0)

    norm = sum(frequencies.values())

    pseudocounts = {}
    for letter in letters:
        frequencies[letter] /= norm
        pseudocounts[letter] = root_instances * frequencies[letter]

    return pseudocounts
341
def split_jaspar_id(id):
    """Utility function to split a JASPAR matrix ID into its component.

    Components are base ID and version number, both returned as strings,
    e.g. 'MA0047.2' is returned as ('MA0047', '2').

    An ID that does not consist of exactly two dot-separated parts is
    returned whole with a version of None, e.g. 'MA0047' gives
    ('MA0047', None). A missing ID (None) gives (None, None).
    """
    if id is None:
        # Motifs read from 'pfm' or 'sites' files carry no matrix ID.
        return (None, None)

    parts = id.split('.')
    if len(parts) == 2:
        return (parts[0], parts[1])
    return (id, None)
360