import codecs
import html
import re

import translitcodec  # provides 'translit/long', used by codecs.encode()  # noqa

from django.conf import settings
from django.utils.encoding import force_str
from django.utils.functional import keep_lazy_text
from django.utils.html import mark_safe, strip_tags
from django.utils.text import slugify as django_slugify, normalize_newlines
from django.utils.translation import gettext_lazy


@keep_lazy_text
def downgrade(value):
    """
    Downgrade unicode to ascii, transliterating accented characters.

    Falsy input (``None``, ``""``, ...) is treated as the empty string.
    The transliteration itself is done by the ``transliterate`` codec,
    which is registered as a side effect of importing ``translitcodec``.
    """
    value = force_str(value or "")
    return codecs.encode(value, 'transliterate')


@keep_lazy_text
def slugify_long(value):
    """Return Django's slug of the transliterated (ASCII) form of ``value``."""
    return django_slugify(downgrade(value))


# Spreading umlauts is included in the translit/long codec.
slugify_german = slugify_long


@keep_lazy_text
def downgrading_slugify(value):
    """
    Transliterate ``value`` to ASCII, slugify it, and turn every run of
    spaces/underscores left in the slug into a single hyphen.
    """
    # Slugify only allowing hyphens, numbers and ASCII characters
    # FIXME django_slugify might return an empty string; take care that we
    # always return something
    return re.sub(r"[ _]+", "-", django_slugify(downgrade(value)))


# Projects may plug in their own slugify implementation through the
# SLUGIFY_FUNCTION setting; ``downgrading_slugify`` is the fallback.
SLUGIFY_FUNCTION = getattr(settings, 'SLUGIFY_FUNCTION', downgrading_slugify)
slugify = SLUGIFY_FUNCTION


@keep_lazy_text
def html_entities_to_unicode(html_str):
    """Replace HTML character references in ``html_str`` with their characters."""
    return html.unescape(html_str)


# Translators: Separator between list elements
DEFAULT_SEPARATOR = gettext_lazy(", ")

# Translators: Last separator of list elements
LAST_WORD_SEPARATOR = gettext_lazy(" and ")


@keep_lazy_text
def text_joined(list_, separator=DEFAULT_SEPARATOR, last_word=LAST_WORD_SEPARATOR):
    """
    Join the elements of ``list_`` into a human-readable enumeration.

    All elements but the last are joined with ``separator``; the last one is
    appended after ``last_word``. An empty iterable yields ``''`` and a single
    element is returned on its own. Every element is coerced with
    ``force_str``, and ``list_`` may be any iterable (it is consumed once).
    """
    items = [force_str(item) for item in list_]
    if not items:
        return ''
    if len(items) == 1:
        return items[0]

    # Coerce both separators so lazy translations and plain strings are
    # handled the same way.
    head = force_str(separator).join(items[:-1])
    return '%s%s%s' % (head, force_str(last_word), items[-1])


# TODO Don't match escaped stars (like \*)
b_pattern = re.compile(r"(\*\*)(.*?)\1")
i_pattern = re.compile(r"(\*)(.*?)\1")
u_pattern = re.compile(r"(__)(.*?)\1")
link_pattern = re.compile(r"\[([^\[]+)\]\(([^\)]+)\)")


@keep_lazy_text
def slimdown(text):
    """
    Converts simplified markdown (`**`, `*`, `__`) to <b>, <i> and <u> tags,
    and `[label](url)` to an <a> tag.

    Returns ``""`` for falsy input. The result is marked safe *without*
    escaping ``text`` first, so any markup already present in the input is
    passed through unchanged.
    NOTE(review): callers presumably only pass trusted text -- confirm.
    """
    if not text:
        return ""

    # NOTE(review): the reviewed copy had lost the HTML tags inside these
    # replacement templates; they are restored to match the docstring --
    # confirm the exact markup against version control.
    # Bold has to run before italic, otherwise `*` would eat the `**` markers.
    text = b_pattern.sub(r"<b>\2</b>", text)
    text = i_pattern.sub(r"<i>\2</i>", text)
    text = u_pattern.sub(r"<u>\2</u>", text)
    text = link_pattern.sub(r'<a href="\2">\1</a>', text)
    return mark_safe(text)


@keep_lazy_text
def strip_links(text):
    """
    Remove opening and closing ``<a>`` tags from ``text``, keeping the
    link label.
    """
    # NOTE(review): the anchor markup in this pattern was missing from the
    # reviewed copy and has been restored -- confirm against version control.
    # Requiring whitespace or ``>`` right after ``<a`` keeps tags such as
    # ``<abbr>`` intact; ``[^>]`` already spans newlines.
    without_opening = re.sub(r'<a(?:\s[^>]*)?>', '', text)
    return without_opening.replace('</a>', '')


COLLAPSE_WHITESPACE_RE = re.compile(r"\s+")


@keep_lazy_text
def collapse_whitespace(text):
    """Squash every whitespace run into one space and trim both ends."""
    return COLLAPSE_WHITESPACE_RE.sub(" ", text).strip()


@keep_lazy_text
def html_to_text(text):
    """
    Reduce HTML to plain text: unescape character references, strip tags,
    then collapse whitespace. ``text`` is coerced with ``str`` first.
    """
    unescaped = html_entities_to_unicode(str(text))
    return collapse_whitespace(strip_tags(unescaped))


# ``sanitized_html`` is only defined when the optional ``html_sanitizer``
# package is installed; without it the name simply does not exist here.
try:
    from html_sanitizer.django import get_sanitizer
except ImportError:
    pass
else:
    def sanitized_html(html, config_name='default'):
        """
        Sanitize ``html`` with the sanitizer configured as ``config_name``.

        The ``html`` parameter shadows the ``html`` module inside this
        function; the name is kept because callers may pass it by keyword.
        """
        return get_sanitizer(config_name).sanitize(html)