Package Bio :: Package SeqUtils :: Module CheckSum
[hide private]
[frames | no frames]

Source Code for Module Bio.SeqUtils.CheckSum

  1  # Copyright 2002 by Yves Bastide and Brad Chapman. 
  2  # Copyright 2007 by Sebastian Bassi 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Functions to calculate assorted sequence checksums.""" 
  9   
 10  # crc32, crc64, gcg, and seguid 
 11  # crc64 is adapted from BioPerl 
 12   
 13  from __future__ import print_function 
 14   
 15  from binascii import crc32 as _crc32 
 16  from Bio._py3k import _as_bytes 
 17   
 18  __docformat__ = "restructuredtext en" 
 19   
 20   
def crc32(seq):
    """Return the CRC32 checksum of a sequence (string or Seq object).

    The input is converted to bytes with ``_as_bytes`` and passed to
    ``binascii.crc32``; whatever integer that call produces is returned
    unchanged.
    """
    # NOTE - On Python 2 the result is a signed int, on Python 3 it is
    # unsigned. The docs suggest crc32(x) & 0xffffffff for consistency.
    # TODO - Should the value be masked with 0xffffffff here?
    try:
        # First attempt: treat the input as a Seq object and stringify it
        checksum = _crc32(_as_bytes(str(seq)))
    except AttributeError:
        # Fallback: hand the string/unicode value over as it is
        checksum = _crc32(_as_bytes(seq))
    return checksum
32 33
def _init_table_h():
    """Build the 256-entry lookup table used by crc64.

    Each entry is the high 32-bit word left after pushing one byte value
    through eight shift steps, XOR-ing in 0xd8000000 whenever a set bit is
    shifted out of the low word.
    """
    def entry(byte):
        low = byte
        high = 0
        for _ in range(8):
            shifted_out = low & 1
            low >>= 1
            if high & 1:
                # Carry the bit leaving the high word into the low word
                low |= (1 << 31)
            high >>= 1
            if shifted_out:
                high ^= 0xd8000000
        return high

    return [entry(byte) for byte in range(256)]


# Initialisation
_table_h = _init_table_h()


def crc64(s):
    """Return the CRC64 checksum for a sequence (string or Seq object).

    The result is a string made of the prefix "CRC-" followed by the high
    and low 32-bit words, each written as eight uppercase hex digits.
    """
    high = low = 0
    for char in s:
        # The table index must come from the low word *before* it shifts
        table_index = (low ^ ord(char)) & 0xFF
        # The bottom byte of the high word moves into the top of the low word
        low = (low >> 8) | ((high & 0xFF) << 24)
        high = (high >> 8) ^ _table_h[table_index]

    return "CRC-%08X%08X" % (high, low)
67 68
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string), compute the
    checksum used by the GCG program: every character is uppercased and its
    ordinal is multiplied by a position weight that counts 1, 2, ..., 57
    and then starts again at 1; the sum is reduced modulo 10000.

    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    """
    try:
        # Seq objects are reduced to their plain string form
        seq = str(seq)
    except AttributeError:
        # Otherwise keep working with the input as given
        pass
    # position is zero-based, so "% 57 + 1" yields the weights 1..57, 1..57, ...
    weighted_total = sum(
        (position % 57 + 1) * ord(residue.upper())
        for position, residue in enumerate(seq)
    )
    return weighted_total % 10000
94 95
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    The SEGUID (SEquence Globally Unique IDentifier) is computed here as
    the SHA-1 digest of the uppercased sequence, base64 encoded, with any
    newlines and the trailing "=" padding removed.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import hashlib
    import base64
    try:
        # Seq objects are reduced to their plain string form
        seq = str(seq)
    except AttributeError:
        # Otherwise keep working with the input as given
        pass
    raw_digest = hashlib.sha1(_as_bytes(seq.upper())).digest()
    try:
        # For Python 3+: encodebytes returns bytes, so decode to text and
        # drop the newlines before trimming the padding
        encoded_text = base64.encodebytes(raw_digest).decode()
        return encoded_text.replace("\n", "").rstrip("=")
    except AttributeError:
        # For all other Pythons
        return base64.b64encode(raw_digest).rstrip("=")
if __name__ == "__main__":
    print("Quick self test")

    # Two light-chain sequences that differ only in their last segment
    str_light_chain_one = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCSSYAGSSTLVFGGGTKLTVL"
    )

    str_light_chain_two = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCCSYAGSSTWVFGGGTKLTVL"
    )

    # The sequences are different, yet their CRC64 values are expected to match
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert crc64(str_light_chain_one) == 'CRC-44CAAD88706CC153'

    # SEGUID is expected to tell the two sequences apart
    assert seguid(str_light_chain_one) == 'BpBeDdcNUYNsdk46JoJdw7Pd3BI'
    assert seguid(str_light_chain_two) == 'X5XEaayob1nZLOc7eVT9qyczarY'

    print("Done")