1
2
3
4
5
6
7
8 """Functions to calculate assorted sequence checksums."""
9
10
11
12
13 from binascii import crc32 as _crc32
14 from Bio._py3k import _as_bytes
15
16
def crc32(seq):
    """Returns the crc32 checksum for a sequence (string or Seq object).

    The argument is converted with ``str()``, turned into bytes with
    ``_as_bytes`` and handed to ``binascii.crc32``; that function's result
    is returned unchanged.
    """
    # NOTE(review): per the binascii documentation the result is signed on
    # Python 2 and unsigned on Python 3; callers needing a stable value
    # across versions should mask it with ``& 0xffffffff``.
    try:
        # Usual path: flatten a Seq-like object (or string) to text first.
        return _crc32(_as_bytes(str(seq)))
    except AttributeError:
        # Fallback: hand the argument to _as_bytes without str() conversion.
        return _crc32(_as_bytes(seq))
28
29
def _init_table_h():
    """Build the 256-entry lookup table used by crc64().

    For each byte value ``i`` the eight bits of ``i`` are consumed least
    significant bit first; at every step the accumulator is shifted right
    by one and XOR-ed with 0xd8000000 when the consumed bit was set.
    """
    table = []
    for i in range(256):
        low = i
        part_h = 0
        for _ in range(8):
            rflag = low & 1
            low >>= 1
            if part_h & 1:
                low |= (1 << 31)
            part_h >>= 1
            if rflag:
                # Plain int literal: valid on Python 2 (auto-promoted to
                # long when needed) and on Python 3, unlike the "L" suffix.
                part_h ^= 0xd8000000
        table.append(part_h)
    return table


# Computed once at import time; crc64() only reads it.
_table_h = _init_table_h()


def crc64(s):
    """Returns the crc64 checksum for a sequence (string or Seq object).

    The checksum is kept as two 32-bit halves and returned as a string of
    the form ``CRC-`` followed by 16 upper-case hexadecimal digits (high
    half first). An empty sequence gives ``CRC-0000000000000000``.
    """
    crcl = 0
    crch = 0
    for c in s:
        # The low byte of the high half moves into the top byte of the
        # low half as the 64-bit value is shifted right by 8 bits.
        shr = (crch & 0xFF) << 24
        temp1h = crch >> 8
        temp1l = (crcl >> 8) | shr
        idx = (crcl ^ ord(c)) & 0xFF
        crch = temp1h ^ _table_h[idx]
        crcl = temp1l

    return "CRC-%08X%08X" % (crch, crcl)
63
64
def gcg(seq):
    """Returns the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.
    seq type = str.
    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    All sequences are converted to uppercase.

    Each character's upper-cased code point is multiplied by a position
    weight that cycles 1, 2, ..., 57, 1, 2, ...; the sum is returned
    modulo 10000, so an empty sequence gives 0.
    """
    try:
        # Flatten a Seq-like object to plain text.
        seq = str(seq)
    except AttributeError:
        # Best effort: use the argument as given.
        pass
    checksum = 0
    for offset, char in enumerate(seq):
        # offset % 57 + 1 reproduces the 1..57 repeating weight.
        checksum += (offset % 57 + 1) * ord(char.upper())
    return checksum % 10000
87
88
def seguid(seq):
    """Returns the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.
    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032

    The value is the SHA-1 digest of the upper-cased sequence, base64
    encoded, with the trailing "=" padding removed.
    """
    import base64
    import hashlib

    try:
        # Flatten a Seq-like object to plain text.
        seq = str(seq)
    except AttributeError:
        # Best effort: use the argument as given.
        pass
    digest = hashlib.sha1(_as_bytes(seq.upper())).digest()
    # b64encode never inserts newlines, so one code path serves every
    # Python version; only the bytes -> text step differs.
    encoded = base64.b64encode(digest)
    if not isinstance(encoded, str):
        # Python 3 returns bytes; the base64 alphabet is pure ASCII.
        encoded = encoded.decode("ascii")
    return encoded.rstrip("=")
115
116
if __name__ == "__main__":
    # Minimal self test, run only when the module is executed as a script.
    # print(...) with a single string argument behaves the same on
    # Python 2 and Python 3.
    print("Quick self test")

    str_light_chain_one = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
                          + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
                          + "YCSSYAGSSTLVFGGGTKLTVL"

    str_light_chain_two = "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK" \
                          + "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY" \
                          + "YCCSYAGSSTWVFGGGTKLTVL"

    # The two chains differ, yet they are expected to share a crc64 value
    # while having distinct SEGUIDs.
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)

    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)

    print("Done")
135