mirror of
https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 10:29:38 +00:00
101 lines
3.7 KiB
Python
101 lines
3.7 KiB
Python
import re
|
|
from markdownify import markdownify as md
|
|
from bs4 import BeautifulSoup
|
|
from typing import Optional
|
|
from urllib.parse import urlparse
|
|
from .config import ReaderConfig
|
|
import logging
|
|
from .database import SessionLocal, BlacklistedDomain
|
|
|
|
# Setup logging with less verbose output.
# NOTE: basicConfig configures the *root* logger as an import-time side
# effect; it is a no-op if the root logger already has handlers.
logging.basicConfig(
    level=logging.WARNING,
    format='%(levelname)s: %(message)s',
)

# Module-level logger used by everything in this file.
logger = logging.getLogger(__name__)
|
|
|
|
# Patterns for cleaning.
# PageReader.clean_html applies these with re.sub (replacement "") before the
# HTML is handed to BeautifulSoup. All except BASE64_IMG_PATTERN are applied
# with re.IGNORECASE | re.MULTILINE | re.DOTALL.

# <script ...> ... </script> blocks, including their contents.
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"

# <style ...> ... </style> blocks, including their contents.
STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"

# Single <meta ...> tags.
META_PATTERN = r"<[ ]*meta.*?>"

# HTML comments: <!-- ... -->.
COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"

# Single <link ...> tags.
LINK_PATTERN = r"<[ ]*link.*?>"

# <img> tags whose double-quoted src is an inline base64 data URI.
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'

# Whole <svg> ... </svg> elements (groups: opening tag, body, closing tag).
SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
|
|
|
|
class PageReader:
    """Turn raw page HTML into whitespace-normalised markdown.

    The pipeline is ``clean_html`` -> markdownify -> ``clean_whitespace``;
    ``html_to_markdown`` runs all three steps.

    Creating an instance builds a ``ReaderConfig`` and opens a
    ``SessionLocal`` database session. The session stays open until
    ``close()`` is called, so either call ``close()`` explicitly or use the
    instance as a context manager::

        with PageReader() as reader:
            markdown = reader.html_to_markdown(html)
    """

    # Flags used for every tag-stripping regex except the base64 image one.
    _TAG_FLAGS = re.IGNORECASE | re.MULTILINE | re.DOTALL

    # Media/embed elements dropped from the parsed tree.
    _ELEMENTS_TO_REMOVE = (
        'canvas', 'img', 'picture', 'audio', 'video',
        'iframe', 'embed', 'object', 'param', 'track',
        'map', 'area', 'source',
    )

    def __init__(self):
        self.config = ReaderConfig()
        self.db = SessionLocal()

    def __enter__(self):
        """Return the reader itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the database session; exceptions are never suppressed."""
        self.close()

    def clean_html(self, html: str) -> str:
        """Clean HTML by removing unwanted elements and patterns.

        Args:
            html: Raw HTML markup.

        Returns:
            The cleaned markup, or ``""`` when ``html`` is empty/``None`` or
            when parsing fails (the failure is logged).
        """
        if not html:
            return ""

        # First use regex to remove problematic patterns. The order matches
        # the original sequence of substitutions and is kept on purpose.
        substitutions = (
            (SCRIPT_PATTERN, self._TAG_FLAGS),
            (STYLE_PATTERN, self._TAG_FLAGS),
            (META_PATTERN, self._TAG_FLAGS),
            (COMMENT_PATTERN, self._TAG_FLAGS),
            (LINK_PATTERN, self._TAG_FLAGS),
            (SVG_PATTERN, self._TAG_FLAGS),
            (BASE64_IMG_PATTERN, 0),
        )
        for pattern, flags in substitutions:
            html = re.sub(pattern, "", html, flags=flags)

        try:
            # Use BeautifulSoup to remove additional elements we want to strip
            soup = BeautifulSoup(html, 'html.parser')

            # Query the tree once per tag name rather than once for all
            # names: a tag nested inside an already-removed parent (e.g.
            # <source> inside <video>) is then no longer found, so nothing
            # is decomposed twice.
            for element in self._ELEMENTS_TO_REMOVE:
                for tag in soup.find_all(element):
                    tag.decompose()

            return str(soup)
        except Exception as e:
            # Best-effort: a page that cannot be parsed yields no content.
            logger.error("Error cleaning HTML: %s", e)
            return ""

    def clean_whitespace(self, text: str) -> str:
        """Clean excessive whitespace from text.

        Trailing whitespace is removed from every line, line endings are
        normalised to ``\\n``, runs of three or more newlines are collapsed
        to exactly two, and the result is stripped at both ends.

        Args:
            text: Text to normalise.

        Returns:
            The normalised text, or ``""`` when ``text`` is empty/``None``
            or an error occurs (the error is logged).
        """
        if not text:
            return ""

        try:
            # Strip trailing whitespace *before* collapsing blank lines.
            # Doing it afterwards (as the code previously did) left runs of
            # whitespace-only lines -- and "\r\n" line endings -- uncollapsed,
            # because "\n \n \n" contains no three consecutive newlines.
            stripped = '\n'.join(line.rstrip() for line in text.splitlines())
            # Replace 3 or more newlines with 2 newlines
            collapsed = re.sub(r'\n{3,}', '\n\n', stripped)
            return collapsed.strip()
        except Exception as e:
            logger.error("Error cleaning whitespace: %s", e)
            return ""

    def html_to_markdown(self, html: str) -> Optional[str]:
        """Convert HTML to markdown.

        Args:
            html: Raw HTML markup.

        Returns:
            The markdown text, or ``None`` when cleaning leaves nothing to
            convert or the conversion raises (the error is logged).
        """
        try:
            cleaned_html = self.clean_html(html)
            if not cleaned_html:
                return None

            markdown = md(
                cleaned_html,
                heading_style="ATX",
                bullets="-",
                autolinks=True,
                strip=['form'],
                escape_asterisks=True,
                escape_underscores=True,
            )
            return self.clean_whitespace(markdown)
        except Exception as e:
            logger.error("Error converting to markdown: %s", e)
            return None

    def close(self):
        """Cleanup resources"""
        self.db.close()