# browser-recall/app/page_reader.py
# (117 lines, 4.5 KiB, Python — header retained from the source viewer this
# file was copied out of; kept as a comment so the module stays importable.)
import re
from markdownify import markdownify as md
from bs4 import BeautifulSoup
from typing import Optional
from urllib.parse import urlparse
from .config import ReaderConfig
from .logging_config import setup_logger
from .database import SessionLocal
# Setup logger for this module
logger = setup_logger(__name__)
# Patterns for cleaning — all but BASE64_IMG_PATTERN are applied by
# PageReader.clean_html with re.IGNORECASE | re.MULTILINE | re.DOTALL.
# `[ ]*` tolerates stray spaces inside tags; non-greedy `.*?` keeps each
# match confined to a single element.
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"  # whole <script>...</script> blocks
STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"  # whole <style>...</style> blocks
META_PATTERN = r"<[ ]*meta.*?>"  # self-closing/void <meta ...> tags
COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"  # HTML comments <!-- ... -->
LINK_PATTERN = r"<[ ]*link.*?>"  # <link ...> tags (stylesheets, icons, ...)
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'  # inline data-URI images
SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"  # whole <svg>...</svg> blocks (groups unused; whole match removed)
class PageReader:
    """Converts raw HTML pages into cleaned Markdown text.

    Pipeline: regex pre-pass to strip script/style/meta/comment/link/svg and
    inline base64 images, a BeautifulSoup pass to drop media/embed elements,
    then markdownify + whitespace normalization.
    """

    def __init__(self):
        # Reader settings come from the shared config object.
        self.config = ReaderConfig()
        logger.info("PageReader initialized")

    def clean_html(self, html: str) -> str:
        """Clean HTML by removing unwanted elements and patterns.

        Args:
            html: Raw HTML string (may be empty).

        Returns:
            Cleaned HTML, or "" on empty input or on a parsing error
            (best-effort: errors are logged, never raised).
        """
        if not html:
            logger.warning("Received empty HTML to clean")
            return ""
        logger.debug("Cleaning HTML of length: %d", len(html))
        # First use regex to remove problematic patterns. The regex pass is
        # deliberately tolerant of malformed markup that could trip a parser.
        flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
        for pattern in (SCRIPT_PATTERN, STYLE_PATTERN, META_PATTERN,
                        COMMENT_PATTERN, LINK_PATTERN, SVG_PATTERN):
            html = re.sub(pattern, "", html, flags=flags)
        # Fix: previously applied with no flags, so uppercase <IMG ...>
        # data-URI images escaped the regex pass (harmless only because the
        # soup pass below also drops img tags). Now consistent with the rest.
        html = re.sub(BASE64_IMG_PATTERN, "", html, flags=re.IGNORECASE)
        try:
            # Use BeautifulSoup to remove additional elements we want to strip
            soup = BeautifulSoup(html, 'html.parser')
            # Media/embed elements that carry no useful text content.
            elements_to_remove = [
                'canvas', 'img', 'picture', 'audio', 'video',
                'iframe', 'embed', 'object', 'param', 'track',
                'map', 'area', 'source'
            ]
            for element in elements_to_remove:
                # Single traversal per tag name (the original called
                # find_all twice: once to count, once to remove).
                tags = soup.find_all(element)
                if tags:
                    logger.debug("Removed %d %s elements", len(tags), element)
                for tag in tags:
                    tag.decompose()
            return str(soup)
        except Exception as e:
            logger.error(f"Error cleaning HTML: {e}", exc_info=True)
            return ""

    def clean_whitespace(self, text: str) -> str:
        """Clean excessive whitespace from text.

        Collapses runs of 3+ newlines to exactly 2, strips trailing
        whitespace from each line, and strips the outer edges.
        Returns "" for empty input or on error (best-effort).
        """
        if not text:
            return ""
        try:
            # Replace 3 or more newlines with 2 newlines
            cleaned = re.sub(r'\n{3,}', '\n\n', text)
            # Remove trailing whitespace from each line
            cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
            return cleaned.strip()
        except Exception as e:
            logger.error(f"Error cleaning whitespace: {e}")
            return ""

    def html_to_markdown(self, html: str) -> Optional[str]:
        """Convert HTML to markdown.

        Args:
            html: Raw HTML string.

        Returns:
            Markdown text, or None when cleaning/conversion yields nothing
            (empty input, parse failure, or whitespace-only result).
        """
        try:
            logger.info("Starting HTML to Markdown conversion")
            logger.debug("Input HTML length: %d", len(html))
            cleaned_html = self.clean_html(html)
            logger.debug("Cleaned HTML length: %d", len(cleaned_html))
            if not cleaned_html:
                logger.warning("No cleaned HTML content")
                return None
            markdown = self.clean_whitespace(md(cleaned_html,
                                                heading_style="ATX",
                                                bullets="-",
                                                autolinks=True,
                                                strip=['form'],
                                                escape_asterisks=True,
                                                escape_underscores=True))
            logger.debug("Generated markdown length: %d", len(markdown) if markdown else 0)
            if not markdown or markdown.isspace():
                logger.warning("Markdown is empty or whitespace only")
                return None
            return markdown
        except Exception:
            logger.error("Error converting to markdown", exc_info=True)
            return None

    async def close(self):
        """Cleanup resources (currently a no-op)."""
        logger.info("Closing PageReader")
        pass  # No need to close DB connection anymore