Package Bio :: Package SeqUtils :: Module CheckSum
[hide private]
[frames] | [no frames]

Source Code for Module Bio.SeqUtils.CheckSum

  1  # Copyright 2002 by Yves Bastide and Brad Chapman. 
  2  # Copyright 2007 by Sebastian Bassi 
  3  # All rights reserved. 
  4  # This code is part of the Biopython distribution and governed by its 
  5  # license.  Please see the LICENSE file that should have been included 
  6  # as part of this package. 
  7   
  8  """Functions to calculate assorted sequence checksums.""" 
  9   
 10  # crc32, crc64, gcg, and seguid 
 11  # crc64 is adapted from BioPerl 
 12   
 13  from __future__ import print_function 
 14   
 15  from binascii import crc32 as _crc32 
 16  from Bio._py3k import _as_bytes 
 17   
 18   
def crc32(seq):
    """Return the CRC-32 checksum (int) for a sequence (string or Seq object).

    The argument is converted to a string with str(), encoded to bytes with
    _as_bytes and handed to binascii.crc32.

    binascii.crc32 yields a signed int on Python 2 and an unsigned int on
    Python 3.  The result is masked with 0xffffffff, as the binascii
    documentation recommends, so the same non-negative value is returned
    on every Python version (on Python 3 the mask changes nothing).
    """
    # str() handles plain strings and Seq objects alike, so a single code
    # path is enough; the former AttributeError fallback performed the same
    # computation without the str() call.
    return _crc32(_as_bytes(str(seq))) & 0xffffffff
30 31
def _init_table_h():
    """Build the 256-entry lookup table used by crc64 (private).

    For every byte value 0..255 a (low, high) register pair is stepped
    eight times: both halves are shifted right by one bit, a bit leaving
    the high half is carried into bit 31 of the low half, and the constant
    0xd8000000 is XORed into the high half whenever a set bit leaves the
    low half.  Only the resulting high half is stored.

    Returns a list of 256 ints.
    """
    poly_high = 0xd8000000
    table = []
    for byte_value in range(256):
        low = byte_value
        high = 0
        for _ in range(8):
            bit_out = low & 1
            low >>= 1
            # Carry the lowest bit of the high half into the low half
            if high & 1:
                low |= (1 << 31)
            high >>= 1
            if bit_out:
                high ^= poly_high
        table.append(high)
    return table
# Initialisation: build the lookup table once, when the module is imported,
# so that crc64() can index it directly for every character it processes.
_table_h = _init_table_h()
def crc64(s):
    """Return the CRC-64 checksum for a sequence (string or Seq object).

    The checksum is kept as two 32-bit halves.  For each character the
    pair is shifted right by eight bits (the low byte of the high half
    moves into the top byte of the low half) and the high half is XORed
    with the _table_h entry selected by the character and the previous
    low byte.

    Returns a string of the form "CRC-" followed by 16 upper-case
    hexadecimal digits (high half first).
    """
    low = 0
    high = 0
    for char in s:
        # Table index comes from the low half *before* it is shifted
        index = (low ^ ord(char)) & 0xFF
        carried = (high & 0xFF) << 24
        low = (low >> 8) | carried
        high = (high >> 8) ^ _table_h[index]

    return "CRC-%08X%08X" % (high, low)
65 66
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string), every
    character is converted to upper case and its ordinal is multiplied by
    a position weight that counts 1, 2, ..., 57 and then starts again at 1.
    The checksum is the sum of these products modulo 10000.

    Checksum used by the GCG program.
    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    """
    try:
        # Seq objects become plain strings here
        text = str(seq)
    except AttributeError:
        # Otherwise iterate over the argument as it was given
        text = seq
    weighted_total = sum(
        (position % 57 + 1) * ord(letter.upper())
        for position, letter in enumerate(text)
    )
    return weighted_total % 10000
89 90
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string), returns the
    SEGUID string (A SEquence Globally Unique IDentifier): the SHA-1 digest
    of the upper-cased sequence, base64 encoded, with newlines and the
    trailing "=" padding removed.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import base64
    import hashlib

    try:
        # Seq objects become plain strings here
        text = str(seq)
    except AttributeError:
        # Otherwise use the argument as it was given
        text = seq
    digest = hashlib.sha1(_as_bytes(text.upper())).digest()

    if hasattr(base64, "encodebytes"):
        # Python 3+: encodebytes returns bytes, so decode before cleaning up
        encoded = base64.encodebytes(digest).decode()
        return encoded.replace("\n", "").rstrip("=")
    # For all other Pythons
    return base64.b64encode(digest).rstrip("=")
if __name__ == "__main__":
    # Minimal self-test, executed only when the module is run as a script.
    print("Quick self test")

    # Two sequences that differ only in their final fragment
    str_light_chain_one = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCSSYAGSSTLVFGGGTKLTVL"
    )

    str_light_chain_two = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCCSYAGSSTWVFGGGTKLTVL"
    )

    # Both sequences are expected to give the same CRC-64 value ...
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert crc64(str_light_chain_one) == 'CRC-44CAAD88706CC153'

    # ... but distinct SEGUIDs.
    assert seguid(str_light_chain_one) == 'BpBeDdcNUYNsdk46JoJdw7Pd3BI'
    assert seguid(str_light_chain_two) == 'X5XEaayob1nZLOc7eVT9qyczarY'

    print("Done")