# Source code for ankipandas.util.checksum
from __future__ import annotations
import re
from hashlib import sha1
from html.entities import name2codepoint
# Implementation directly copied from Anki (anki/anki/utils.py).
# Only a bit of PEP8ing and making things private.
# Precompiled patterns used by the HTML-stripping helpers in this module.
# "(?s)" lets "." match newlines; "(?i)" makes the match case-insensitive.

# HTML comments: <!-- ... --> (non-greedy, may span several lines).
_reComment = re.compile("(?s)<!--.*?-->")
# <style> elements together with everything up to the closing </style>.
_reStyle = re.compile("(?si)<style.*?>.*?</style>")
# <script> elements together with everything up to the closing </script>.
_reScript = re.compile("(?si)<script.*?>.*?</script>")
# Any tag: a "<" followed by the shortest run of characters up to a ">".
_reTag = re.compile("(?s)<.*?>")
# Entity candidates: "&", an optional "#", word characters, then ";"
# (e.g. &amp;, &#38;, &#x26;).
_reEnts = re.compile(r"&#?\w+;")
# <img> tags; group 1 captures the (optionally quoted) value of src.
_reMedia = re.compile("(?i)<img[^>]+src=[\"']?([^\"'>]+)[\"']?[^>]*>")
def _strip_html(s):
    """Remove HTML markup from a string and decode its entities.

    Args:
        s: String that may contain HTML.

    Returns:
        The string with comments, style elements, script elements and all
        remaining tags removed, and with entities converted to text.
    """
    # The order is significant: comments, <style> and <script> are dropped
    # together with their contents before the generic tag pattern runs.
    for pattern in (_reComment, _reStyle, _reScript, _reTag):
        s = pattern.sub("", s)
    return _ents_to_txt(s)
def _strip_html_media(s):
    """Strip HTML from a string but keep the file names of its images.

    Args:
        s: String that may contain HTML.

    Returns:
        The string with every ``<img>`` tag replaced by the value of its
        ``src`` attribute (surrounded by single spaces) and all other HTML
        removed.
    """
    # Substitute the src value for the tag first, so that the file name
    # survives the generic tag removal that follows.
    with_filenames = _reMedia.sub(" \\1 ", s)
    return _strip_html(with_filenames)
def _ents_to_txt(html):
    """Convert HTML entities and character references to plain text.

    Named entities (``&amp;``), decimal references (``&#38;``) and
    hexadecimal references (``&#x26;``) are replaced by the character they
    stand for. Anything that cannot be resolved is left unchanged.

    Args:
        html: String that may contain HTML entities.

    Returns:
        The string with all resolvable entities replaced.
    """
    # entitydefs defines nbsp as \xa0 instead of a standard space, so we
    # replace it first
    html = html.replace("&nbsp;", " ")

    def fixup(m):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    return chr(int(text[3:-1], 16))
                return chr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or a code point chr() cannot represent
                # (very large values raise OverflowError, not ValueError).
                pass
        else:
            # named entity
            try:
                return chr(name2codepoint[text[1:-1]])
            except KeyError:
                pass
        return text  # leave as is

    return _reEnts.sub(fixup, html)
def _checksum(data):
    """Return the hexadecimal SHA-1 digest of the given data.

    Args:
        data: Bytes-like object, or a string (which is encoded as UTF-8
            before hashing).

    Returns:
        The digest as a string of hexadecimal digits.
    """
    payload = data.encode("utf-8") if isinstance(data, str) else data
    return sha1(payload).hexdigest()
# --- Public API ---
def field_checksum(data: str) -> int:
    """Compute the 32 bit unsigned field checksum of a string.

    The checksum is the integer value of the first 8 hexadecimal digits of
    the SHA-1 hash of ``data``, taken after HTML has been stripped (media
    file names are kept). Apply this to the first field to get the field
    checksum that is used by Anki to detect duplicates.

    Args:
        data: Field content, possibly containing HTML.

    Returns:
        Integer between 0 and 2**32 - 1.
    """
    plain_text = _strip_html_media(data)
    hex_digest = _checksum(plain_text.encode("utf-8"))
    # Eight hex digits are exactly 32 bits.
    return int(hex_digest[:8], 16)