Update extension to use single websocket and like 100 other things

2025-12-06 10:29:38 +00:00 · 2025-01-25 23:28:32 -06:00
parent 7388ac18d4
commit 9378f77a61
12 changed files with 634 additions and 374 deletions
--- a/app/page_reader.py
+++ b/app/page_reader.py
@@ -4,15 +4,11 @@ from bs4 import BeautifulSoup
 from typing import Optional
 from urllib.parse import urlparse
 from .config import ReaderConfig
-import logging
-from .database import SessionLocal, BlacklistedDomain
+from .logging_config import setup_logger
+from .database import SessionLocal

-# Setup logging with less verbose output
-logging.basicConfig(
-    level=logging.WARNING,
-    format='%(levelname)s: %(message)s'
-)
-logger = logging.getLogger(__name__)
+# Setup logger for this module
+logger = setup_logger(__name__)

 # Patterns for cleaning
 SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
@@ -26,13 +22,15 @@ SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
 class PageReader:
    def __init__(self):
        self.config = ReaderConfig()
-        self.db = SessionLocal()
+        logger.info("PageReader initialized")

    def clean_html(self, html: str) -> str:
        """Clean HTML by removing unwanted elements and patterns."""
        if not html:
+            logger.warning("Received empty HTML to clean")
            return ""

+        logger.debug(f"Cleaning HTML of length: {len(html)}")
        # First use regex to remove problematic patterns
        html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
@@ -54,12 +52,15 @@ class PageReader:
            ]

            for element in elements_to_remove:
+                removed = len(soup.find_all(element))
+                if removed:
+                    logger.debug(f"Removed {removed} {element} elements")
                for tag in soup.find_all(element):
                    tag.decompose()

            return str(soup)
        except Exception as e:
-            logger.error(f"Error cleaning HTML: {e}")
+            logger.error(f"Error cleaning HTML: {e}", exc_info=True)
            return ""

    def clean_whitespace(self, text: str) -> str:
@@ -80,11 +81,17 @@ class PageReader:
    def html_to_markdown(self, html: str) -> Optional[str]:
        """Convert HTML to markdown."""
        try:
+            logger.info("Starting HTML to Markdown conversion")
+            logger.debug(f"Input HTML length: {len(html)}")
+
            cleaned_html = self.clean_html(html)
+            logger.debug(f"Cleaned HTML length: {len(cleaned_html)}")
+
            if not cleaned_html:
+                logger.warning("No cleaned HTML content")
                return None

-            return self.clean_whitespace(md(cleaned_html,
+            markdown = self.clean_whitespace(md(cleaned_html,
                                          heading_style="ATX",
                                          bullets="-",
                                          autolinks=True,
@@ -92,10 +99,19 @@ class PageReader:
                                          escape_asterisks=True,
                                          escape_underscores=True))

+            logger.debug(f"Generated markdown length: {len(markdown) if markdown else 0}")
+
+            if not markdown or markdown.isspace():
+                logger.warning("Markdown is empty or whitespace only")
+                return None
+
+            return markdown
+
        except Exception as e:
-            logger.error(f"Error converting to markdown: {e}")
+            logger.error("Error converting to markdown", exc_info=True)
            return None

-    def close(self):
+    async def close(self):
        """Cleanup resources"""
-        self.db.close()
+        logger.info("Closing PageReader")
+        pass  # No need to close DB connection anymore