Package Bio :: Package SeqUtils :: Module CheckSum
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqUtils.CheckSum

  1  # Copyright 2002 by Yves Bastide and Brad Chapman. 
  2  # Copyright 2007 by Sebastian Bassi 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Functions to calculate assorted sequence checksums.""" 
  9   
 10  # crc32, crc64, gcg, and seguid 
 11  # crc64 is adapted from BioPerl 
 12   
 13  from __future__ import print_function 
 14   
 15  from binascii import crc32 as _crc32 
 16  from Bio._py3k import _as_bytes 
 17   
 18   
def crc32(seq):
    """Return the CRC-32 checksum (int) for a sequence (string or Seq object).

    The argument is converted with ``str()``, turned into bytes and passed
    to ``binascii.crc32``. The result is masked with ``0xffffffff`` so the
    same non-negative value is returned on Python 2 (where the underlying
    function returns a signed int) and on Python 3 (where it is unsigned).

    Note that the case is important:

    >>> crc32("ACGTACGTACGT")
    20049947
    >>> crc32("acgtACGTacgt")
    1688586483

    """
    # str() accepts plain strings and Seq objects alike, so a separate
    # fallback path for "not a Seq" is not needed.
    return _crc32(_as_bytes(str(seq))) & 0xffffffff
39 40
41 -def _init_table_h():
42 _table_h = [] 43 for i in range(256): 44 l = i 45 part_h = 0 46 for j in range(8): 47 rflag = l & 1 48 l >>= 1 49 if part_h & 1: 50 l |= (1 << 31) 51 part_h >>= 1 52 if rflag: 53 part_h ^= 0xd8000000 54 _table_h.append(part_h) 55 return _table_h
# Initialisation: build the lookup table once, when the module is loaded,
# so that crc64() can index it directly on every call.
_table_h = _init_table_h()
def crc64(s):
    """Return the crc64 checksum for a sequence (string or Seq object).

    The checksum is kept as two words (high and low) that are updated
    once per character of ``s`` using the module-level ``_table_h``
    lookup table. The result is formatted as the text ``CRC-`` followed
    by both words in upper-case hexadecimal, zero-padded to at least
    eight digits each.

    Note that the case is important:

    >>> crc64("ACGTACGTACGT")
    'CRC-C4FBB762C4A87EBD'
    >>> crc64("acgtACGTacgt")
    'CRC-DA4509DC64A87EBD'

    """
    high = low = 0
    for symbol in s:
        # Both values below are derived from the words *before* this step.
        table_index = (low ^ ord(symbol)) & 0xFF
        carried = (high & 0xFF) << 24
        # Shift the 2-word value right by 8 bits; the low byte of the high
        # word moves into the top byte of the low word.
        low = (low >> 8) | carried
        high = (high >> 8) ^ _table_h[table_index]

    return "CRC-%08X%08X" % (high, low)
83 84
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    return the GCG checksum (int), the checksum used by the GCG program.

    Each character is upper-cased and its code point is multiplied by a
    position weight that cycles 1, 2, ..., 57, 1, 2, ...; the sum of these
    products modulo 10000 is returned. An empty sequence gives 0.

    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.

    All sequences are converted to uppercase.

    >>> gcg("ACGTACGTACGT")
    5688
    >>> gcg("acgtACGTacgt")
    5688

    """
    # str() accepts plain strings and Seq objects alike, so a separate
    # fallback path for "not a Seq" is not needed.
    text = str(seq)
    # Zero-based position p maps to weight (p % 57) + 1, i.e. 1..57 repeating.
    checksum = sum(
        (position % 57 + 1) * ord(char.upper())
        for position, char in enumerate(text)
    )
    return checksum % 10000
116 117
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    return the SEGUID string (A SEquence Globally Unique IDentifier).

    The sequence is upper-cased, hashed with SHA-1, and the digest is
    Base64 encoded with the trailing ``=`` padding removed.

    Note that the case is not important:

    >>> seguid("ACGTACGTACGT")
    'If6HIvcnRSQDVNiAoefAzySc6i4'
    >>> seguid("acgtACGTacgt")
    'If6HIvcnRSQDVNiAoefAzySc6i4'

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import hashlib
    import base64

    # str() accepts plain strings and Seq objects alike, so a separate
    # fallback path for "not a Seq" is not needed.
    digest = hashlib.sha1(_as_bytes(str(seq).upper())).digest()

    # b64encode emits no line breaks, so nothing has to be stripped besides
    # the padding. It returns bytes on Python 3 and str on Python 2; decode
    # only in the former case so a native string is returned on both.
    encoded = base64.b64encode(digest)
    if not isinstance(encoded, str):
        encoded = encoded.decode("ascii")
    return encoded.rstrip("=")
if __name__ == "__main__":
    # Script entry point. run_doctest is a Biopython helper; presumably it
    # runs the doctest examples in this module's docstrings (confirm in
    # Bio._utils).
    from Bio._utils import run_doctest
    run_doctest()