Package Bio :: Package motifs :: Module transfac
[hide private]
[frames | no frames]

Source Code for Module Bio.motifs.transfac

  1  # Copyright 2003 by Bartek Wilczynski.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parsing TRANSFAC files 
  7  """ 
  8   
  9  import warnings 
 10   
 11  from Bio import BiopythonParserWarning 
 12  from Bio import motifs 
 13  from Bio.Alphabet import IUPAC 
 14   
 15   
class Motif(motifs.Motif, dict):
    """Hold everything the parser found for a single TRANSFAC motif.

    The class derives from both Bio.motifs.Motif and the built-in dict.
    Whatever the parser can express through the Bio.motifs.Motif base
    class is stored as attributes of that base class (see its
    documentation for a description of them). Every other field is kept
    as a (key, value) pair in the dictionary part, the key being the
    two-letter code used in the TRANSFAC file. References are the one
    exception: they live in the .references attribute.

    Two-letter fields commonly found in TRANSFAC files::

        AC:    Accession number
        AS:    Accession numbers, secondary
        BA:    Statistical basis
        BF:    Binding factors
        BS:    Factor binding sites underlying the matrix
               [sequence; SITE accession number; start position for matrix
               sequence; length of sequence used; number of gaps inserted;
               strand orientation.]
        CC:    Comments
        CO:    Copyright notice
        DE:    Short factor description
        DR:    External databases
               [database name: database accession number]
        DT:    Date created/updated
        HC:    Subfamilies
        HP:    Superfamilies
        ID:    Identifier
        NA:    Name of the binding factor
        OC:    Taxonomic classification
        OS:    Species/Taxon
        OV:    Older version
        PV:    Preferred version
        TY:    Type
        XX:    Empty line; these are not stored in the Record.

    The .references attribute is a list of dictionaries, one per
    reference, using these keys::

        RN:    Reference number
        RA:    Reference authors
        RL:    Reference data
        RT:    Reference title
        RX:    PubMed ID

    For more information, see the TRANSFAC documentation.
    """

    # Keys that may appear on several lines of one motif; their values are
    # collected in a list.
    multiple_value_keys = set(('BF', 'OV', 'HP', 'BS', 'HC', 'DT', 'DR'))

    # Keys that belong to the reference opened by the preceding RN line.
    reference_keys = set(('RX', 'RA', 'RT', 'RL'))
class Record(list):
    """List of the motifs read from one TRANSFAC matrix table.

    The record is itself a list; its items are the individual motifs.

    Attributes:
     o version:   The version number, corresponding to the 'VV' field
                  in the TRANSFAC file; None until a value is assigned.
    """

    def __init__(self):
        # A record always starts out as an empty list with no known version.
        list.__init__(self)
        self.version = None

    def __str__(self):
        # The string form is the TRANSFAC text produced by the module-level
        # write() function.
        return write(self)
86 87
def read(handle):
    """Parse TRANSFAC-formatted text and return it as a Record.

    Arguments:
     - handle: an iterable yielding the lines of a TRANSFAC file (for
       example an open file handle).

    Returns a Record. record.version is set from the 'VV' line if there
    is one. One Motif is appended for every '//'-terminated entry that
    contains a 'P0' (or old-style 'PO') count matrix; entries without a
    matrix are skipped. Each motif carries its annotations as dictionary
    items and its references as a list of dictionaries in .references.

    Format problems that make a line unusable are reported through assert
    statements (AssertionError). Deviations the parser can tolerate (a
    separator other than two spaces, a matrix starting at row "00", a
    one-digit row number) only emit a BiopythonParserWarning.
    """
    annotations = {}
    references = []
    reference = None  # the reference opened by the most recent RN line
    counts = None  # stays None until the current entry provides a matrix
    record = Record()
    for line in handle:
        line = line.strip()
        if not line:
            continue
        key_value = line.split(None, 1)
        key = key_value[0].strip()
        assert len(key) == 2, \
            'The key value of a TRANSFAC motif line should have 2 characters: "{0:s}"'.format(line)
        if len(key_value) == 2:
            value = key_value[1].strip()
            # The warning is about a two-space separator, so look for two
            # consecutive spaces rather than for any single space.
            if not line.partition('  ')[1]:
                warnings.warn(
                    'A TRANSFAC motif line should have 2 spaces between key and value columns: "{0:s}"'.format(line),
                    BiopythonParserWarning
                )
        else:
            # Key-only line: never reuse the value of an earlier line.
            value = ''
        if key == 'VV':
            record.version = value
        elif key in ('P0', 'PO'):  # Old TRANSFAC files use PO instead of P0
            assert value.split()[:4] == ['A', 'C', 'G', 'T'], \
                'A TRANSFAC matrix "{0:s}" line should be followed by "A C G T": "{1:s}"'.format(key, line)
            counts = dict((letter, []) for letter in "ACGT")
            length = 0
            # Consume the matrix rows. The first line whose key is not a
            # number ends the matrix; it is left in line/key/value so that
            # the dispatch below handles it like any other line.
            for line in handle:
                line = line.strip()
                if not line:
                    continue
                key_value = line.split(None, 1)
                key = key_value[0].strip()
                if len(key_value) == 2:
                    value = key_value[1].strip()
                    if not line.partition('  ')[1]:
                        warnings.warn(
                            'A TRANSFAC motif line should have 2 spaces between key and value columns: "{0:s}"'.format(
                                line),
                            BiopythonParserWarning)
                else:
                    value = ''
                try:
                    i = int(key)
                except ValueError:
                    break
                if length == 0 and i == 0:
                    warnings.warn(
                        'A TRANSFAC matrix should start with "01" as first row of the matrix, '
                        'but this matrix uses "00": "{0:s}"'.format(line),
                        BiopythonParserWarning)
                else:
                    length += 1
                assert i == length, \
                    'The TRANSFAC matrix row number does not match the position in the matrix: "{0:s}"'.format(line)
                if len(key) == 1:
                    warnings.warn(
                        'A TRANSFAC matrix line should have a 2 digit key at the start of the lin ("{0:02d}"), '
                        'but this matrix uses "{0:d}": "{1:s}".'.format(i, line),
                        BiopythonParserWarning)
                assert len(key_value) == 2, \
                    'A TRANSFAC matrix line should have a key and a value: "{0:s}"'.format(line)
                values = value.split()[:4]
                assert len(values) == 4, \
                    'A TRANSFAC matrix line should have a value for each nucleotide (A, C, G and T): "{0:s}"'.format(
                        line)
                for letter, count in zip("ACGT", values):
                    counts[letter].append(float(count))
        # Deliberately a fresh "if": a VV line also reaches the generic
        # annotation branch, and the line that ended a matrix is handled here.
        if line == 'XX':
            pass
        elif key == 'RN':
            index = value.partition(";")[0]
            assert index.startswith('['), \
                'The index "{0:s}" in a TRANSFAC RN line should start with a "[": "{1:s}"'.format(index, line)
            assert index.endswith(']'), \
                'The index "{0:s}" in a TRANSFAC RN line should end with a "]": "{1:s}"'.format(index, line)
            index = int(index[1:-1])
            assert len(references) == index - 1, \
                'The index "{0:d}" of the TRANSFAC RN line does not match the current number ' \
                'of seen references "{1:d}": "{2:s}"'.format(index, len(references), line)
            reference = {key: value}
            references.append(reference)
        elif key == '//':
            if counts is not None:
                motif = Motif(alphabet=IUPAC.unambiguous_dna, counts=counts)
                motif.update(annotations)
                motif.references = references
                record.append(motif)
            # Start the next entry from a clean slate so that neither the
            # matrix nor the last reference of this entry leaks into it.
            annotations = {}
            references = []
            reference = None
            counts = None
        elif key in Motif.reference_keys:
            assert reference is not None, \
                'A TRANSFAC "{0:s}" line should be preceded by an "RN" line: "{1:s}"'.format(key, line)
            reference[key] = value
        elif key in Motif.multiple_value_keys:
            annotations.setdefault(key, []).append(value)
        else:
            annotations[key] = value
    return record
184 185
def _format_matrix(motif):
    """Return the 'P0' count-matrix lines of motif ([] if its length is 0)."""
    length = motif.length
    if length == 0:
        return []
    consensus = motif.degenerate_consensus
    letters = sorted(motif.alphabet.letters)
    # Each count is right-aligned in a 6-wide field preceded by one space,
    # so a column spans 7 characters. The header letters and the trailing
    # consensus letter are right-aligned in columns of that same width.
    # NOTE: a count that needs more than 4 characters leaves fewer than two
    # spaces after the row number.
    header = "P0" + "".join(["%7s" % letter for letter in letters])
    row_format = " ".join(["%02.d"] + ["%6.20g"] * len(letters)) + "%7s"
    lines = [header]
    for i in range(length):
        row = [i + 1] + [motif.counts[letter][i] for letter in letters] + [consensus[i]]
        lines.append(row_format % tuple(row))
    return lines


def _format_references(motif):
    """Return the RN/RX/RA/RT/RL lines for the references of motif, if any."""
    references = getattr(motif, "references", None)
    if not references:
        return []
    lines = []
    for reference in references:
        for reference_key in ("RN", "RX", "RA", "RT", "RL"):
            value = reference.get(reference_key)
            if value is not None:
                lines.append("%s  %s" % (reference_key, value))
    return lines


def write(motifs):
    """Return the given motifs as one string in TRANSFAC format.

    Arguments:
     - motifs: an iterable of motifs, typically a Record. If it has a
       .version attribute that is not None, a leading 'VV' block is
       written. (The parameter name hides the imported Bio.motifs module
       inside this function; it is kept for backward compatibility.)

    Each motif must provide .length, .degenerate_consensus,
    .alphabet.letters and .counts for the matrix. Annotations are read
    with motif.get(key), so motifs without dictionary behaviour simply
    have none, and references are taken from motif.references when that
    attribute exists.

    Keys and values are separated by two spaces, which is the separator
    the TRANSFAC parser in this module asks for. Sections are written in a
    fixed order, each non-empty section followed by an 'XX' line, and
    every motif is closed by a '//' line.
    """
    blocks = []
    version = getattr(motifs, "version", None)
    if version is not None:
        blocks.append("VV  %s\nXX\n//\n" % version)
    multiple_value_keys = Motif.multiple_value_keys
    sections = (('AC', 'AS',),  # Accession
                ('ID',),  # ID
                ('DT', 'CO'),  # Date, copyright
                ('NA',),  # Name
                ('DE',),  # Short factor description
                ('TY',),  # Type
                ('OS', 'OC'),  # Organism
                ('HP', 'HC'),  # Superfamilies, subfamilies
                ('BF',),  # Binding factors
                ('P0',),  # Frequency matrix
                ('BA',),  # Statistical basis
                ('BS',),  # Factor binding sites
                ('CC',),  # Comments
                ('DR',),  # External databases
                ('OV', 'PV',),  # Versions
                )
    for motif in motifs:
        lines = []
        for section in sections:
            blank = False  # becomes True once this section produced content
            for key in section:
                if key == 'P0':
                    matrix_lines = _format_matrix(motif)
                    if matrix_lines:
                        lines.extend(matrix_lines)
                        blank = True
                    continue
                try:
                    value = motif.get(key)
                except AttributeError:
                    # Plain motifs without dictionary behaviour carry no
                    # annotations.
                    value = None
                if value is not None:
                    if key in multiple_value_keys:
                        lines.extend(["%s  %s" % (key, item) for item in value])
                    else:
                        lines.append("%s  %s" % (key, value))
                    blank = True
                if key == 'PV':
                    # References share the section of the version keys.
                    reference_lines = _format_references(motif)
                    if reference_lines:
                        lines.extend(reference_lines)
                        blank = True
            if blank:
                lines.append('XX')
        # Finished this motif; glue the lines together
        lines.append("//")
        blocks.append("\n".join(lines) + "\n")
    # Finished all motifs; glue the blocks together
    return "".join(blocks)
287