browser-recall/app/page_reader.py

import logging
import re
from typing import Optional
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from markdownify import markdownify as md

from .config import ReaderConfig
from .database import SessionLocal, BlacklistedDomain

# Set up logging with less verbose output
logging.basicConfig(
    level=logging.WARNING,
    format='%(levelname)s: %(message)s'
)
logger = logging.getLogger(__name__)

# Patterns for cleaning
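# Most of these are applied in clean_html() with re.IGNORECASE | re.MULTILINE | re.DOTALL,
# so they also match tags whose bodies span multiple lines; BeautifulSoup then
# prunes the remaining unwanted elements.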
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
META_PATTERN = r"<[ ]*meta.*?>"
COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
LINK_PATTERN = r"<[ ]*link.*?>"
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"


class PageReader:
    def __init__(self):
        self.config = ReaderConfig()
        self.db = SessionLocal()

    def clean_html(self, html: str) -> str:
        """Clean HTML by removing unwanted elements and patterns."""
        if not html:
            return ""

        # First use regex to remove problematic patterns
        html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(BASE64_IMG_PATTERN, "", html)

        try:
            # Use BeautifulSoup to remove additional elements we want to strip
            soup = BeautifulSoup(html, 'html.parser')

            # Remove unwanted elements
            elements_to_remove = [
                'canvas', 'img', 'picture', 'audio', 'video',
                'iframe', 'embed', 'object', 'param', 'track',
                'map', 'area', 'source'
            ]
            for element in elements_to_remove:
                for tag in soup.find_all(element):
                    tag.decompose()

            return str(soup)
        except Exception as e:
            logger.error(f"Error cleaning HTML: {e}")
            return ""

    def clean_whitespace(self, text: str) -> str:
        """Clean excessive whitespace from text."""
        if not text:
            return ""

        try:
            # Replace 3 or more newlines with 2 newlines
            cleaned = re.sub(r'\n{3,}', '\n\n', text)
            # Remove trailing whitespace from each line
            cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
            return cleaned.strip()
        except Exception as e:
            logger.error(f"Error cleaning whitespace: {e}")
            return ""

    def html_to_markdown(self, html: str) -> Optional[str]:
        """Convert HTML to markdown."""
        try:
            cleaned_html = self.clean_html(html)
            if not cleaned_html:
                return None

            return self.clean_whitespace(md(
                cleaned_html,
                heading_style="ATX",
                bullets="-",
                autolinks=True,
                strip=['form'],
                escape_asterisks=True,
                escape_underscores=True
            ))
        except Exception as e:
            logger.error(f"Error converting to markdown: {e}")
            return None

    def close(self):
        """Cleanup resources"""
        self.db.close()
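

# Minimal usage sketch (illustrative only, not part of the original module).
# It assumes the database behind SessionLocal is configured, since PageReader
# opens a session in __init__, and that the file is run as a module
# (e.g. `python -m app.page_reader`) so the relative imports resolve.
# The sample HTML string below is a hypothetical input.
if __name__ == "__main__":
    reader = PageReader()
    try:
        sample = "<h1>Title</h1><script>track()</script><p>Hello <b>world</b></p>"
        # Expected to print ATX-style markdown such as "# Title" followed by the paragraph text.
        print(reader.html_to_markdown(sample))
    finally:
        reader.close()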