You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 

38 lines
1.3 KiB

import re
from django.contrib.auth.models import AnonymousUser
from django.http import HttpRequest
from django.template.loader import render_to_string
from django.utils.html import strip_tags
from shared.utils.text import html_entities_to_unicode
def render_page_as_html(page, template, context_data, css_selector=None):
    """Render ``template`` to HTML on behalf of an anonymous user.

    Args:
        page: Not used by this function.
        template: Template passed to ``render_to_string``; must be truthy.
        context_data: Context passed to ``render_to_string``.
        css_selector: Optional CSS selector. When given, every ``<script>``
            and ``<style>`` element is removed from the rendered document
            and only the elements matching the selector are returned, each
            serialized, stripped and joined with newlines.

    Returns:
        The full rendered HTML when no selector is given, otherwise the
        serialized matching elements (an empty string if nothing matches or
        the rendered output is blank).

    Raises:
        AssertionError: If ``template`` is falsy.
    """
    if not template:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped when Python runs with -O; the exception type is unchanged.
        raise AssertionError("No template supplied")

    request = HttpRequest()
    request.user = AnonymousUser()
    html = render_to_string(template, context_data, request=request)

    if not css_selector:
        return html
    if not html.strip():
        # lxml raises on an empty document; nothing could match anyway.
        return ''

    # Imported lazily: lxml is only needed when a selector is supplied.
    import lxml.html

    doc = lxml.html.fromstring(html)
    for element in doc.cssselect('script,style'):
        # drop_tree() keeps the text that follows the element (its tail);
        # parent.remove(element) would silently discard that text.
        element.drop_tree()

    parts = [
        lxml.html.tostring(part).decode().strip()
        for part in doc.cssselect(css_selector)
    ]
    return '\n'.join(parts)
# Words longer than this many characters are left out of the text output.
_MAX_WORD_LENGTH = 245


def render_page_as_text(page, template, context_data, css_selector=None):
    """Render ``template`` and reduce the result to plain text.

    The HTML produced by ``render_page_as_html`` has its tags stripped and
    its entities converted, then whitespace is normalised: runs of tabs and
    spaces become a single space, spaces at the start of a line are removed
    and consecutive newlines are collapsed into one. Finally, words longer
    than ``_MAX_WORD_LENGTH`` characters are dropped.

    Args:
        page: Forwarded unchanged to ``render_page_as_html``.
        template: Forwarded unchanged to ``render_page_as_html``.
        context_data: Forwarded unchanged to ``render_page_as_html``.
        css_selector: Forwarded unchanged to ``render_page_as_html``.

    Returns:
        The normalised plain text.
    """
    html = render_page_as_html(page, template, context_data, css_selector)
    text = html_entities_to_unicode(strip_tags(html)).strip()

    text = re.sub(r'[\t ]+', ' ', text)
    text = re.sub(r'\n +', '\n', text)
    text = re.sub(r'\n+', '\n', text)

    # Filter line by line: splitting the whole text on spaces alone would
    # fuse consecutive space-free lines into one oversized "word" and drop
    # all of them together.
    kept_lines = []
    for line in text.split('\n'):
        kept = ' '.join(
            word for word in line.split(' ') if len(word) <= _MAX_WORD_LENGTH
        )
        # Skip lines that held nothing but over-long words, so the output
        # still contains no blank lines.
        if kept:
            kept_lines.append(kept)
    return '\n'.join(kept_lines)