Update extension to use single websocket and like 100 other things

This commit is contained in:
2025-01-25 23:28:32 -06:00
parent 7388ac18d4
commit 9378f77a61
12 changed files with 634 additions and 374 deletions

View File

@@ -4,15 +4,11 @@ from bs4 import BeautifulSoup
from typing import Optional
from urllib.parse import urlparse
from .config import ReaderConfig
import logging
from .database import SessionLocal, BlacklistedDomain
from .logging_config import setup_logger
from .database import SessionLocal
# Setup logging with less verbose output
logging.basicConfig(
level=logging.WARNING,
format='%(levelname)s: %(message)s'
)
logger = logging.getLogger(__name__)
# Setup logger for this module
logger = setup_logger(__name__)
# Patterns for cleaning
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
@@ -26,13 +22,15 @@ SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
class PageReader:
def __init__(self):
self.config = ReaderConfig()
self.db = SessionLocal()
logger.info("PageReader initialized")
def clean_html(self, html: str) -> str:
"""Clean HTML by removing unwanted elements and patterns."""
if not html:
logger.warning("Received empty HTML to clean")
return ""
logger.debug(f"Cleaning HTML of length: {len(html)}")
# First use regex to remove problematic patterns
html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
@@ -54,12 +52,15 @@ class PageReader:
]
for element in elements_to_remove:
removed = len(soup.find_all(element))
if removed:
logger.debug(f"Removed {removed} {element} elements")
for tag in soup.find_all(element):
tag.decompose()
return str(soup)
except Exception as e:
logger.error(f"Error cleaning HTML: {e}")
logger.error(f"Error cleaning HTML: {e}", exc_info=True)
return ""
def clean_whitespace(self, text: str) -> str:
@@ -80,11 +81,17 @@ class PageReader:
def html_to_markdown(self, html: str) -> Optional[str]:
"""Convert HTML to markdown."""
try:
logger.info("Starting HTML to Markdown conversion")
logger.debug(f"Input HTML length: {len(html)}")
cleaned_html = self.clean_html(html)
logger.debug(f"Cleaned HTML length: {len(cleaned_html)}")
if not cleaned_html:
logger.warning("No cleaned HTML content")
return None
return self.clean_whitespace(md(cleaned_html,
markdown = self.clean_whitespace(md(cleaned_html,
heading_style="ATX",
bullets="-",
autolinks=True,
@@ -92,10 +99,19 @@ class PageReader:
escape_asterisks=True,
escape_underscores=True))
logger.debug(f"Generated markdown length: {len(markdown) if markdown else 0}")
if not markdown or markdown.isspace():
logger.warning("Markdown is empty or whitespace only")
return None
return markdown
except Exception as e:
logger.error(f"Error converting to markdown: {e}")
logger.error("Error converting to markdown", exc_info=True)
return None
def close(self):
async def close(self):
"""Cleanup resources"""
self.db.close()
logger.info("Closing PageReader")
pass # No need to close DB connection anymore