Package Bio :: Package SeqUtils :: Module CheckSum
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqUtils.CheckSum

  1  # Copyright 2002 by Yves Bastide and Brad Chapman. 
  2  # Copyright 2007 by Sebastian Bassi 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Functions to calculate assorted sequence checksums.""" 
  9   
 10  # crc32, crc64, gcg, and seguid 
 11  # crc64 is adapted from BioPerl 
 12   
 13  from __future__ import print_function 
 14   
 15  from binascii import crc32 as _crc32 
 16  from Bio._py3k import _as_bytes 
 17   
 18   
def crc32(seq):
    """Return the crc32 checksum for a sequence (string or Seq object).

    The argument is converted with ``str()``, passed through ``_as_bytes``
    and then handed to ``binascii.crc32``.

    Note that the case is important:

    >>> crc32("ACGTACGTACGT")
    20049947
    >>> crc32("acgtACGTacgt")
    1688586483

    """
    # NOTE - On Python 2 returns a signed int, on Python 3 it is unsigned
    # Docs suggest should use crc32(x) & 0xffffffff for consistency.
    # TODO - Should we return crc32(x) & 0xffffffff here?
    #
    # str() handles Seq objects and plain strings alike, so a separate
    # "except AttributeError" fallback for strings is not needed: both
    # paths performed the same computation.
    return _crc32(_as_bytes(str(seq)))
39 40
41 -def _init_table_h():
42 _table_h = [] 43 for i in range(256): 44 l = i 45 part_h = 0 46 for j in range(8): 47 rflag = l & 1 48 l >>= 1 49 if part_h & 1: 50 l |= (1 << 31) 51 part_h >>= 1 52 if rflag: 53 part_h ^= 0xd8000000 54 _table_h.append(part_h) 55 return _table_h
# Initialisation: the lookup table is computed once, when the module loads
_table_h = _init_table_h()
def crc64(s):
    """Return the crc64 checksum for a sequence (string or Seq object).

    The checksum is kept as two 32-bit halves while iterating over the
    characters of ``s`` and is returned as the string ``CRC-`` followed by
    both halves formatted as zero-padded uppercase hexadecimal.

    Note that the case is important:

    >>> crc64("ACGTACGTACGT")
    'CRC-C4FBB762C4A87EBD'
    >>> crc64("acgtACGTacgt")
    'CRC-DA4509DC64A87EBD'

    """
    high = 0
    low = 0
    for char in s:
        # Table index comes from the low byte *before* the halves are shifted
        table_index = (low ^ ord(char)) & 0xFF
        # Shift the (high, low) pair right by one byte; the byte leaving
        # the high half becomes the top byte of the low half
        low = (low >> 8) | ((high & 0xFF) << 24)
        high = (high >> 8) ^ _table_h[table_index]

    return "CRC-%08X%08X" % (high, low)
84 85
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.
    seq type = str.

    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.

    All sequences are converted to uppercase.

    >>> gcg("ACGTACGTACGT")
    5688
    >>> gcg("acgtACGTacgt")
    5688

    """
    # str() handles Seq objects and plain strings alike, so no fallback
    # branch is needed for the string case.
    checksum = 0
    for position, char in enumerate(str(seq)):
        # Each character is weighted by its position, with the weights
        # cycling 1, 2, ..., 57 and then starting again at 1.
        checksum += (position % 57 + 1) * ord(char.upper())
    return checksum % 10000
117 118
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).
    seq type = str.

    The SEGUID is the base64 encoding of the SHA-1 digest of the
    upper-cased sequence, with the trailing "=" padding removed.

    Note that the case is not important:

    >>> seguid("ACGTACGTACGT")
    'If6HIvcnRSQDVNiAoefAzySc6i4'
    >>> seguid("acgtACGTacgt")
    'If6HIvcnRSQDVNiAoefAzySc6i4'

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import base64
    import hashlib

    # str() handles Seq objects and plain strings alike, so no fallback
    # branch is needed for the string case.
    digest = hashlib.sha1(_as_bytes(str(seq).upper())).digest()
    # b64encode exists on every supported Python and never inserts newlines,
    # so there is no need to probe for base64.encodebytes via AttributeError.
    encoded = base64.b64encode(digest)
    if not isinstance(encoded, str):
        # Python 3 returns bytes here; the SEGUID is returned as text
        encoded = encoded.decode("ascii")
    return encoded.rstrip("=")
if __name__ == "__main__":
    # Executed as a script: call Biopython's run_doctest helper
    from Bio._utils import run_doctest
    run_doctest()