1 """Hold GenBank data in a straightforward format.
2
3 classes:
4 o Record - All of the information in a GenBank record.
5 o Reference - hold reference data for a record.
6 o Feature - Hold the information in a Feature Table.
7 o Qualifier - Qualifiers on a Feature.
8 17-MAR-2009: added support for WGS and WGS_SCAFLD lines. Ying Huang & Iddo Friedberg
9 """
10
11 import Bio.GenBank
12
13
def _wrapped_genbank(information, indent, wrap_space=1, split_char=" ",
                     line_length=None):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with newlines and indentation so
    it fits properly into a GenBank record.

    Arguments:

    o information - The string holding the information we want
    wrapped in GenBank format.

    o indent - The indentation on the lines we are writing.

    o wrap_space - Whether or not to wrap only at occurrences of
    split_char. When false, the information is cut into fixed-width
    slices instead.

    o split_char - A specific character to split the lines on. By default
    spaces are used.

    o line_length - Total width of an output line, indent included.
    When None, Record.GB_LINE_LENGTH is used.

    Returns the wrapped text. The first line carries no indent, every
    following line is prefixed with indent spaces, and every line ends
    with a newline. Empty information, or information made up only of
    split characters, yields ".\\n".

    Raises ValueError when wrap_space is false and indent leaves no room
    for any text on a line.
    """
    if line_length is None:
        line_length = Record.GB_LINE_LENGTH
    info_length = line_length - indent

    if not information:
        # "." stands in for missing data.
        return ".\n"

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        if info_length <= 0:
            # A zero or negative slice width would never consume the input.
            raise ValueError("indent %r leaves no room on a %r character line"
                             % (indent, line_length))
        info_parts = [information[start:start + info_length]
                      for start in range(0, len(information), info_length)]

    # Greedily pack the parts into lines of at most info_length characters.
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        if len(cur_part) + 1 + len(info_part) > info_length:
            if cur_part:
                # A non-space separator is kept at the end of the line
                # it was split on.
                if split_char != " ":
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        elif cur_part == "":
            cur_part = info_part
        else:
            cur_part += split_char + info_part

    # Flush whatever is left over from the last iteration.
    if cur_part:
        output_parts.append(cur_part)

    if not output_parts:
        # Nothing but split characters: treat it like missing data rather
        # than failing on an empty list.
        return ".\n"

    return ("\n" + " " * indent).join(output_parts) + "\n"
75
76
def _indent_genbank(information, indent):
    """Write out information with the specified indent.

    Unlike _wrapped_genbank, this function makes no attempt to wrap
    lines -- it assumes that the information already has newlines in the
    appropriate places, and will add the specified indent to the start of
    each line after the first.

    Arguments:

    o information - The text to indent, with lines separated by "\\n".

    o indent - The number of spaces placed in front of every line
    except the first.

    Returns the indented text; every line, including the last, ends
    with a newline.
    """
    # Joining on "newline + indent" leaves the first line untouched and
    # prefixes every later one.
    return ("\n" + " " * indent).join(information.split("\n")) + "\n"
94
95
class Record(object):
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like '28-JUL-1998'
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number.
    o version - The accession number + version (ie. AB01234.2)
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (ie. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq file
    o project - The genome sequencing project numbers
    (will be replaced by the dblink cross-references in 2009).
    o dblinks - The genome sequencing project number(s) and other links.
    (will replace the project information in 2009).
    """

    # Column widths used when laying out a record.
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # %-format templates derived from the widths above. Each one pads a
    # keyword out to the column where its data starts; the INTERNAL
    # variants first indent the keyword itself.
    BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
    INTERNAL_FORMAT = (" " * GB_INTERNAL_INDENT + "%-" +
                       str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s")
    OTHER_INTERNAL_FORMAT = (" " * GB_OTHER_INTERNAL_INDENT + "%-" +
                             str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) +
                             "s")

    BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
    INTERNAL_FEATURE_FORMAT = (" " * GB_FEATURE_INTERNAL_INDENT + "%-" +
                               str(GB_FEATURE_INDENT -
                                   GB_FEATURE_INTERNAL_INDENT) + "s")
    # Right-justified, unlike the keyword templates above.
    SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"
158
189
230
def _locus_line(self):
    """Provide the output string for the LOCUS line.

    The line is built from self.locus, self.size, self.residue_type,
    self.data_file_division and self.date, each padded to a fixed
    width, and ends with a newline.
    """
    residue_type = self.residue_type

    # Protein records are sized in amino acids, everything else in base pairs.
    if "PROTEIN" in residue_type:
        units = " aa"
    else:
        units = " bp "

    # The residue type column is laid out differently depending on
    # whether it names a topology ("circular") or a strandedness
    # prefix such as "ss-".
    if "circular" in residue_type:
        residue_field = "%17s" % residue_type
    elif "-" in residue_type:
        residue_field = "%7s" % residue_type + " " * 10
    else:
        residue_field = " " * 3 + "%-4s" % residue_type + " " * 10

    return "".join([
        "LOCUS",
        " " * 7,
        "%-9s" % self.locus,
        " ",
        "%7s" % self.size,
        units,
        residue_field,
        " " * 2,
        "%3s" % self.data_file_division,
        " " * 7,
        "%11s" % self.date,
        "\n",
    ])
263
270
287
def _version_line(self):
    """Output for the VERSION line.

    Returns an empty string when self.version is not set. Otherwise
    the line holds the VERSION keyword, self.version and self.gi
    after a "GI:" label, and ends with a newline.
    """
    if not self.version:
        return ""
    return (Record.BASE_FORMAT % "VERSION" + self.version +
            " GI:" + "%s\n" % self.gi)
299
def _project_line(self):
    """Output for the PROJECT line.

    Returns an empty string when self.projects is empty. Otherwise the
    line holds the PROJECT keyword followed by the entries of
    self.projects separated by single spaces, and ends with a newline.
    """
    if not self.projects:
        return ""
    return Record.BASE_FORMAT % "PROJECT" + "%s\n" % " ".join(self.projects)
306
314
def _nid_line(self):
    """Output for the NID line. Use of NID is obsolete in GenBank files.

    Returns an empty string when self.nid is not set.
    """
    if not self.nid:
        return ""
    return Record.BASE_FORMAT % "NID" + "%s\n" % self.nid
324
def _pid_line(self):
    """Output for PID line. Presumably, PID usage is also obsolete.

    Returns an empty string when self.pid is not set.
    """
    if not self.pid:
        return ""
    return Record.BASE_FORMAT % "PID" + "%s\n" % self.pid
334
def _keywords_line(self):
    """Output for the KEYWORDS line.

    The line is always produced: the entries of self.keywords are
    joined with "; " and closed with a period, so an empty keyword
    list gives a lone ".". The text is wrapped with _wrapped_genbank
    at Record.GB_BASE_INDENT.
    """
    # "%s" formatting mirrors how each keyword was rendered before, so
    # non-string entries are still accepted.
    keyword_info = "; ".join(["%s" % keyword
                              for keyword in self.keywords]) + "."
    return (Record.BASE_FORMAT % "KEYWORDS" +
            _wrapped_genbank(keyword_info, Record.GB_BASE_INDENT))
352
def _db_source_line(self):
    """Output for DBSOURCE line.

    Returns an empty string when self.db_source is not set.
    """
    if not self.db_source:
        return ""
    return Record.BASE_FORMAT % "DBSOURCE" + "%s\n" % self.db_source
362
371
378
395
405
def _features_line(self):
    """Output for the FEATURES line.

    This is only the header of the feature table; it is left out
    (empty string) when self.features is empty.
    """
    if not self.features:
        return ""
    return Record.BASE_FEATURE_FORMAT % "FEATURES" + "Location/Qualifiers\n"
414
def _base_count_line(self):
    """Output for the BASE COUNT line with base information.

    Returns an empty string when self.base_counts is not set.
    Otherwise self.base_counts is split on spaces and, when it holds
    an even number of tokens, re-emitted as (count, type) pairs with
    each count right-justified in 7 columns. With an odd number of
    tokens the original string is written unchanged.
    """
    if not self.base_counts:
        return ""

    output = Record.BASE_FORMAT % "BASE COUNT "

    # Splitting on single spaces leaves empty strings wherever there
    # were runs of spaces; drop them.
    count_parts = [part for part in self.base_counts.split(" ") if part]

    if len(count_parts) % 2 == 0:
        # Tokens alternate: count, type, count, type, ...
        pairs = zip(count_parts[0::2], count_parts[1::2])
        output += "".join("%7s %s" % pair for pair in pairs)
    else:
        # Tokens cannot be paired up, so fall back to the raw text.
        output += self.base_counts

    return output + "\n"
440
454
def _sequence_line(self):
    """Output for all of the sequence.

    The sequence is written in lower case, 60 residues per line, as up
    to six space-separated groups of 10. Each line starts with the
    1-based position of its first residue, formatted with
    Record.SEQUENCE_FORMAT, and ends with a newline. Returns an empty
    string when self.sequence is empty.
    """
    if not self.sequence:
        return ""

    sequence = self.sequence
    total = len(sequence)
    lines = []
    for line_start in range(0, total, 60):
        line = Record.SEQUENCE_FORMAT % str(line_start + 1)
        line_end = min(line_start + 60, total)
        # Only groups that contain residues are written, so a sequence
        # ending exactly on a 10-residue boundary does not get an empty
        # trailing group (and with it a trailing space).
        for group_start in range(line_start, line_end, 10):
            line += " %s" % sequence[group_start:group_start + 10].lower()
        lines.append(line + "\n")
    return "".join(lines)
477
484
def _wgs_scafld_line(self):
    """Output for the WGS_SCAFLD line.

    Returns an empty string when self.wgs_scafld is not set. Otherwise
    returns the WGS_SCAFLD keyword followed by self.wgs_scafld.
    """
    if not self.wgs_scafld:
        return ""
    # NOTE(review): unlike the other line writers here, no newline is
    # appended -- presumably the stored value already ends with one;
    # confirm against the code that fills in wgs_scafld.
    return Record.BASE_FORMAT % "WGS_SCAFLD" + self.wgs_scafld
491
501
502
504 """Hold information from a GenBank reference.
505
506 Attributes:
507 o number - The number of the reference in the listing of references.
508 o bases - The bases in the sequence the reference refers to.
509 o authors - String with all of the authors.
510 o consrtm - Consortium the authors belong to.
511 o title - The title of the reference.
512 o journal - Information about the journal where the reference appeared.
513 o medline_id - The medline id for the reference.
514 o pubmed_id - The pubmed_id for the reference.
515 o remark - Free-form remarks about the reference.
516 """
527
539
def _reference_line(self):
    """Output for REFERENCE lines.

    The line always starts with the REFERENCE keyword and ends with a
    newline. self.number is written when set, padded to 3 columns and
    followed by self.bases when that is set as well.
    """
    output = Record.BASE_FORMAT % "REFERENCE"
    if self.number:
        if self.bases:
            output += "%-3s" % self.number + "%s" % self.bases
        else:
            output += "%s" % self.number
    return output + "\n"
553
562
571
580
589
598
607
616
617
class Feature(object):
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
    o key - The key name of the feature (ie. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A list of Qualifier objects in the feature.
    """
    def __init__(self):
        """Start with an empty key and location and no qualifiers."""
        self.key = ''
        self.location = ''
        # A fresh list per instance, so features never share qualifiers.
        self.qualifiers = []
630
648
649
class Qualifier(object):
    """Hold information about a qualifier in a GenBank feature.

    Attributes:
    o key - The key name of the qualifier (ie. /organism=)
    o value - The value of the qualifier ("Dictyostelium discoideum").
    """
    def __init__(self):
        """Start with an empty key and value."""
        self.key = ''
        self.value = ''
660