Package Bio :: Package motifs :: Module transfac
[hide private]
[frames | no frames]

Source Code for Module Bio.motifs.transfac

  1  # Copyright 2003 by Bartek Wilczynski.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """Parsing TRANSFAC files.""" 
  7   
  8  import warnings 
  9   
 10  from Bio import BiopythonParserWarning 
 11  from Bio import motifs 
 12  from Bio.Alphabet import IUPAC 
 13   
 14   
class Motif(motifs.Motif, dict):
    """Hold everything known about a single TRANSFAC motif.

    The class derives both from the Bio.motifs.Motif base class and from the
    built-in dict. Whatever the parser can express through the base class is
    stored as attributes of that base class (see Bio.motifs.Motif for their
    description). Every other field is kept as a (key, value) pair in the
    dictionary, keyed by the two-letter field code used in the TRANSFAC
    file. References are the exception: they live in the .references
    attribute.

    Fields commonly found in TRANSFAC files::

        AC:    Accession number
        AS:    Accession numbers, secondary
        BA:    Statistical basis
        BF:    Binding factors
        BS:    Factor binding sites underlying the matrix
               [sequence; SITE accession number; start position for matrix
               sequence; length of sequence used; number of gaps inserted;
               strand orientation.]
        CC:    Comments
        CO:    Copyright notice
        DE:    Short factor description
        DR:    External databases
               [database name: database accession number]
        DT:    Date created/updated
        HC:    Subfamilies
        HP:    Superfamilies
        ID:    Identifier
        NA:    Name of the binding factor
        OC:    Taxonomic classification
        OS:    Species/Taxon
        OV:    Older version
        PV:    Preferred version
        TY:    Type
        XX:    Empty line; these are not stored in the Record.

    The .references attribute is a list of dictionaries with these keys::

        RN:    Reference number
        RA:    Reference authors
        RL:    Reference data
        RT:    Reference title
        RX:    PubMed ID

    For more information, see the TRANSFAC documentation.
    """

    # Field codes that may appear several times within one motif entry.
    multiple_value_keys = set(('BF', 'OV', 'HP', 'BS', 'HC', 'DT', 'DR'))

    # Field codes that belong to a literature reference.
    reference_keys = set(('RX', 'RA', 'RT', 'RL'))


class Record(list):
    """Container for the motifs of one TRANSFAC matrix table.

    A Record is a plain list whose items are the individual motifs.

    Attributes:
     - version - The version number, corresponding to the 'VV' field
       in the TRANSFAC file; None until a value is assigned.

    """

    def __init__(self):
        """Create an empty record whose version is not yet known."""
        super(Record, self).__init__()
        self.version = None

    def __str__(self):
        """Return the record as text by delegating to write()."""
        return write(self)


def _split_key_value(line):
    """Split a stripped, non-empty TRANSFAC line into (key, value).

    The value is the empty string when the line holds nothing but a key.
    A BiopythonParserWarning is issued when a value is present but the line
    contains no run of two spaces.
    """
    fields = line.split(None, 1)
    key = fields[0]
    if len(fields) == 1:
        return key, ""
    if "  " not in line:
        warnings.warn(
            'A TRANSFAC motif line should have 2 spaces between key and value columns: "{0:s}"'.format(line),
            BiopythonParserWarning
        )
    return key, fields[1].strip()


def read(handle):
    """Parse TRANSFAC-formatted text into a Record.

    Arguments:
     - handle - an iterable yielding the lines of a TRANSFAC file, for
       example an open file handle.

    Returns a Record. The value of the 'VV' field is stored in its .version
    attribute, and one Motif is appended for every entry (terminated by a
    '//' line) that contains a 'P0' (or old-style 'PO') count matrix.
    Entries without a matrix, such as the version header, are skipped.

    Raises AssertionError for malformed lines (key not two characters long,
    bad matrix header, wrong row numbering, missing matrix values, malformed
    or out-of-order RN lines, reference fields without a preceding RN line).
    Deviations that can still be parsed only produce a
    BiopythonParserWarning.
    """
    # NOTE(review): validation relies on assert statements, which Python
    # removes when run with -O. The exception type is kept as it is because
    # callers may depend on AssertionError.
    annotations = {}
    references = []
    reference = None  # the reference opened by the most recent RN line
    counts = None
    record = Record()
    for line in handle:
        line = line.strip()
        if not line:
            continue
        key, value = _split_key_value(line)
        assert len(key) == 2, 'The key value of a TRANSFAC motif line should have 2 characters: "{0:s}"'.format(line)
        if key == 'VV':
            record.version = value
        elif key in ('P0', 'PO'):  # Old TRANSFAC files use PO instead of P0
            assert value.split()[:4] == ['A', 'C', 'G', 'T'], \
                'A TRANSFAC matrix "{0:s}" line should be followed by "A C G T": "{1:s}"'.format(key, line)
            counts = dict((letter, []) for letter in "ACGT")
            length = 0
            # Consume the matrix rows. The loop stops at the first line whose
            # key is not a number; that line is left in line/key/value and
            # is dispatched by the second if-chain below.
            for line in handle:
                line = line.strip()
                if not line:
                    # Blank lines are ignored here just like everywhere else.
                    continue
                key, value = _split_key_value(line)
                try:
                    i = int(key)
                except ValueError:
                    break
                if length == 0 and i == 0:
                    warnings.warn(
                        'A TRANSFAC matrix should start with "01" as first row of the matrix, '
                        'but this matrix uses "00": "{0:s}"'.format(line),
                        BiopythonParserWarning)
                else:
                    length += 1
                assert i == length, \
                    'The TRANSFAC matrix row number does not match the position in the matrix: "{0:s}"'.format(line)
                if len(key) == 1:
                    warnings.warn(
                        'A TRANSFAC matrix line should have a 2 digit key at the start of the line ("{0:02d}"), '
                        'but this matrix uses "{0:d}": "{1:s}".'.format(i, line),
                        BiopythonParserWarning)
                assert value, \
                    'A TRANSFAC matrix line should have a key and a value: "{0:s}"'.format(line)
                values = value.split()[:4]
                assert len(values) == 4, \
                    'A TRANSFAC matrix line should have a value for each nucleotide (A, C, G and T): "{0:s}"'.format(
                        line)
                for letter, v in zip("ACGT", values):
                    counts[letter].append(float(v))
        # Deliberately a separate chain: it must also handle the line on
        # which the matrix loop above stopped.
        if line == 'XX':
            pass
        elif key == 'RN':
            index, separator, accession = value.partition(";")
            assert index.startswith('['), \
                'The index "{0:s}" in a TRANSFAC RN line should start with a "[": "{1:s}"'.format(index, line)
            assert index.endswith(']'), \
                'The index "{0:s}" in a TRANSFAC RN line should end with a "]": "{1:s}"'.format(index, line)
            index = int(index[1:-1])
            assert len(references) == index - 1, \
                'The index "{0:d}" of the TRANSFAC RN line does not match the current number ' \
                'of seen references "{1:d}": "{2:s}"'.format(index, len(references) + 1, line)
            reference = {key: value}
            references.append(reference)
        elif key == '//':
            if counts is not None:
                motif = Motif(alphabet=IUPAC.unambiguous_dna, counts=counts)
                motif.update(annotations)
                motif.references = references
                record.append(motif)
            # Start the next entry from a clean slate, so that an entry
            # without a matrix or without RN lines cannot inherit data from
            # the entry before it.
            annotations = {}
            references = []
            reference = None
            counts = None
        elif key in Motif.reference_keys:
            assert reference is not None, \
                'A TRANSFAC reference line should be preceded by an RN line: "{0:s}"'.format(line)
            reference[key] = value
        elif key in Motif.multiple_value_keys:
            if key not in annotations:
                annotations[key] = []
            annotations[key].append(value)
        else:
            annotations[key] = value
    return record


def write(motifs):
    """Return the given motifs as text in TRANSFAC format.

    Arguments:
     - motifs - an iterable of motif objects. If it has a .version attribute
       that is not None (as a Record does), a 'VV' header entry is written
       first.

    For each motif the fields are written section by section in a fixed
    order, each non-empty section followed by an 'XX' line, and the entry
    is closed with '//'. The count matrix is written as a 'P0' header plus
    one numbered row per position, ending in the degenerate consensus
    letter. References found in motif.references are written in the last
    section, after the 'OV' and 'PV' fields.

    Key and value are separated by two spaces, which is the layout that
    read() in this module expects.
    """
    blocks = []
    version = getattr(motifs, "version", None)
    if version is not None:
        blocks.append("VV  %s\nXX\n//\n" % version)
    multiple_value_keys = Motif.multiple_value_keys
    sections = (('AC', 'AS',),  # Accession
                ('ID',),  # ID
                ('DT', 'CO'),  # Date, copyright
                ('NA',),  # Name
                ('DE',),  # Short factor description
                ('TY',),  # Type
                ('OS', 'OC'),  # Organism
                ('HP', 'HC'),  # Superfamilies, subfamilies
                ('BF',),  # Binding factors
                ('P0',),  # Frequency matrix
                ('BA',),  # Statistical basis
                ('BS',),  # Factor binding sites
                ('CC',),  # Comments
                ('DR',),  # External databases
                ('OV', 'PV',),  # Versions
                )
    reference_keys = ("RN", "RX", "RA", "RT", "RL")
    for motif in motifs:
        lines = []
        for section in sections:
            blank = False
            for key in section:
                if key == 'P0':
                    # Frequency matrix
                    length = motif.length
                    if length == 0:
                        continue
                    sequence = motif.degenerate_consensus
                    letters = sorted(motif.alphabet.letters)
                    # Six spaces put each header letter above the last
                    # character of its right-aligned, six-wide count column.
                    lines.append("      ".join(["P0"] + letters))
                    row_format = " ".join(
                        ["%02.d"] +
                        ["%6.20g"] * len(letters)) + \
                        "      %s"
                    for i in range(length):
                        row = [i + 1]
                        row.extend(motif.counts[letter][i] for letter in letters)
                        row.append(sequence[i])
                        lines.append(row_format % tuple(row))
                    blank = True
                    continue
                try:
                    value = motif.get(key)
                except AttributeError:
                    # The motif is not dict-like, so it has no annotations.
                    value = None
                if value is not None:
                    if key in multiple_value_keys:
                        for v in value:
                            lines.append("%s  %s" % (key, v))
                    else:
                        lines.append("%s  %s" % (key, value))
                    blank = True
                if key == 'PV':
                    # References follow the version fields in this section.
                    for reference in getattr(motif, "references", ()):
                        for reference_key in reference_keys:
                            reference_value = reference.get(reference_key)
                            if reference_value is None:
                                continue
                            lines.append("%s  %s" % (reference_key, reference_value))
                            blank = True
            if blank:
                lines.append('XX')
        # Finished this motif; glue the lines together
        lines.append("//")
        blocks.append("\n".join(lines) + "\n")
    # Finished all motifs; glue the blocks together
    return "".join(blocks)