1
2
3
4
5
6 """Parsing TRANSFAC files
7 """
8
9 from Bio import motifs
10 from Bio.Alphabet import IUPAC
11
12
class Motif(motifs.Motif, dict):
    """Store the information of one TRANSFAC motif.

    The class derives from both the Bio.motifs.Motif base class and the
    built-in dictionary. Whatever the parser can map onto the base class
    is kept there as attributes (the Bio.motifs.Motif class describes
    them). Everything else that belongs to the motif is kept as
    (key, value) items of the dictionary, keyed by the two-letter field
    code used in the TRANSFAC file. The literature references are the one
    exception: they live in the .references attribute.

    Field codes commonly seen in TRANSFAC files:
        AC: Accession number
        AS: Accession numbers, secondary
        BA: Statistical basis
        BF: Binding factors
        BS: Factor binding sites underlying the matrix
            [sequence; SITE accession number; start position for matrix
            sequence; length of sequence used; number of gaps inserted;
            strand orientation.]
        CC: Comments
        CO: Copyright notice
        DE: Short factor description
        DR: External databases
            [database name: database accession number]
        DT: Date created/updated
        HC: Subfamilies
        HP: Superfamilies
        ID: Identifier
        NA: Name of the binding factor
        OC: Taxonomic classification
        OS: Species/Taxon
        OV: Older version
        PV: Preferred version
        TY: Type
        XX: Empty line; these are not stored in the Record.

    The .references attribute is a list of dictionaries, each of which
    may use these keys:
        RN: Reference number
        RA: Reference authors
        RL: Reference data
        RT: Reference title
        RX: PubMed ID

    Consult the TRANSFAC documentation for further details.
    """

    # Field codes whose values are collected into a list, one item per line.
    multiple_value_keys = set(('BF', 'OV', 'HP', 'BS', 'HC', 'DT', 'DR'))

    # Field codes that belong to the reference opened by the latest RN line.
    reference_keys = set(('RX', 'RA', 'RT', 'RL'))
64
65
66
68 """A Bio.motifs.transfac.Record stores the information in a TRANSFAC
69 matrix table. The record inherits from a list containing the individual
70 motifs.
71
72 Attributes:
73 o version: The version number, corresponding to the 'VV' field
74 in the TRANSFAC file;
75 """
78
81
82
def read(handle):
    """Parse a TRANSFAC matrix file: record = read(handle)

    Arguments:
     - handle: an iterable over the lines of the file, for example an
       open file object. Any iterable is accepted; it is wrapped in a
       single iterator so that the nested matrix loop continues where
       the outer loop stopped.

    Returns a Record. Its .version attribute is set from the 'VV' line,
    and one Motif is appended for every entry (terminated by '//') that
    contains a count matrix. Each Motif carries the entry's remaining
    fields as dictionary items and its references in .references.

    Raises:
     - AssertionError if the matrix header is not 'A C G T', if matrix
       rows are not numbered consecutively from 1, or if an RN line is
       malformed or out of sequence (kept as assertions so the exception
       type seen by existing callers does not change);
     - ValueError if a matrix value cannot be converted to float, or if
       a reference field (RX, RA, RT, RL) appears before any RN line of
       the same entry.
    """
    lines = iter(handle)
    record = Record()
    annotations = {}
    references = []
    reference = None
    counts = None
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines carry no field; do not store them under key ''.
            continue
        key, value = line[:2], line[4:]
        if key == 'VV':
            # NOTE: as before, the line also falls through to the generic
            # handling below and is kept in the annotations of the entry.
            record.version = value
        elif key in ('P0', 'PO'):
            # Both spellings of the matrix header are accepted.
            assert value.split()[:4] == ['A', 'C', 'G', 'T'], \
                "unexpected matrix header: %r" % line
            counts = dict((letter, []) for letter in "ACGT")
            length = 0
            for line in lines:
                # Strip here as well, so that the line ending the matrix
                # is handled below exactly like any other line.
                line = line.strip()
                key, value = line[:2], line[4:]
                try:
                    i = int(key)
                except ValueError:
                    # First line that is not a numbered matrix row.
                    break
                length += 1
                assert i == length, \
                    "matrix row %d found where row %d was expected" \
                    % (i, length)
                for letter, v in zip("ACGT", value.split()):
                    counts[letter].append(float(v))
            else:
                # Input ended inside the matrix; there is no terminating
                # line left to interpret.
                break
            if not line:
                continue
        if line == 'XX':
            pass
        elif key == 'RN':
            index = value.partition(";")[0]
            assert index.startswith('[') and index.endswith(']'), \
                "malformed reference number: %r" % value
            index = int(index[1:-1])
            assert len(references) == index - 1, \
                "reference %d is out of sequence" % index
            reference = {key: value}
            references.append(reference)
        elif key == '//':
            if counts is not None:
                motif = Motif(alphabet=IUPAC.unambiguous_dna, counts=counts)
                motif.update(annotations)
                motif.references = references
                record.append(motif)
            # Start the next entry from a clean state, so that nothing of
            # this entry (matrix, open reference) leaks into the next one.
            annotations = {}
            references = []
            reference = None
            counts = None
        elif key in Motif.reference_keys:
            if reference is None:
                raise ValueError(
                    "reference field %r found before any RN line" % key)
            reference[key] = value
        elif key in Motif.multiple_value_keys:
            annotations.setdefault(key, []).append(value)
        else:
            annotations[key] = value
    return record
138
def write(motifs):
    """Return the representation of the motifs in TRANSFAC format.

    Arguments:
     - motifs: an iterable of motif objects. If it has a .version
       attribute that is not None, a version block is written first.
       Each motif must provide .length, .degenerate_consensus, .counts
       (indexable by 'A', 'C', 'G', 'T' and then by position) and a
       dictionary-style .get(key); a .references list of dictionaries
       is written if present.

    Returns a single string. Every field line consists of the two-letter
    code, two spaces, and the value, so the value starts at column 4.
    Each non-empty section is followed by an 'XX' line and each entry is
    terminated by a '//' line.
    """
    blocks = []
    # Plain sequences of motifs have no version; only write it if present.
    version = getattr(motifs, "version", None)
    if version is not None:
        blocks.append("VV  %s\nXX\n//\n" % version)
    multiple_value_keys = Motif.multiple_value_keys
    # Fields are written section by section, in this order.
    sections = (
        ('AC', 'AS'),
        ('ID',),
        ('DT', 'CO'),
        ('NA',),
        ('DE',),
        ('TY',),
        ('OS', 'OC'),
        ('HP', 'HC'),
        ('BF',),
        ('P0',),
        ('BA',),
        ('BS',),
        ('CC',),
        ('DR',),
        ('OV', 'PV'),
    )
    reference_fields = ("RN", "RX", "RA", "RT", "RL")
    for motif in motifs:
        lines = []
        for section in sections:
            blank = False  # becomes True once the section wrote a line
            for key in section:
                if key == 'P0':
                    # Count matrix; skipped entirely for an empty motif.
                    length = motif.length
                    if length == 0:
                        continue
                    sequence = motif.degenerate_consensus
                    # Header letters are aligned with the %6 columns below.
                    lines.append("P0      A      C      G      T")
                    for i in range(length):
                        lines.append(
                            "%02d %6.20g %6.20g %6.20g %6.20g %s" % (
                                i + 1,
                                motif.counts['A'][i],
                                motif.counts['C'][i],
                                motif.counts['G'][i],
                                motif.counts['T'][i],
                                sequence[i],
                            ))
                    blank = True
                    continue
                try:
                    value = motif.get(key)
                except AttributeError:
                    # Motif without a dictionary interface: no annotations.
                    value = None
                if value is not None:
                    if key in multiple_value_keys:
                        for item in value:
                            lines.append("%s  %s" % (key, item))
                    else:
                        lines.append("%s  %s" % (key, value))
                    blank = True
                if key == 'PV':
                    # The references share the last section with OV/PV.
                    for reference in getattr(motif, "references", ()):
                        for field in reference_fields:
                            entry = reference.get(field)
                            if entry is None:
                                continue
                            lines.append("%s  %s" % (field, entry))
                            blank = True
            if blank:
                lines.append('XX')
        lines.append("//")
        blocks.append("\n".join(lines) + "\n")
    return "".join(blocks)
237