Mirror of https://github.com/Zetaphor/browser-recall.git, synced 2025-12-06 02:19:37 +00:00
Update extension to use single websocket and like 100 other things
6 .gitignore vendored
@@ -1 +1,5 @@
__pycache__/
__pycache__/
logs/
*.db
*.db-shm
*.db-wal

@@ -3,6 +3,44 @@ from pathlib import Path
from typing import Set
import fnmatch

class Config:
    def __init__(self):
        self.config_path = Path(__file__).parent / "config.yaml"
        self.load_config()

    def load_config(self):
        if not self.config_path.exists():
            self.config = {"ignored_domains": []}
            self.save_config()
        else:
            with open(self.config_path, 'r') as f:
                self.config = yaml.safe_load(f)

    def save_config(self):
        with open(self.config_path, 'w') as f:
            yaml.dump(self.config, f)

    def is_domain_ignored(self, domain: str) -> bool:
        """Check if a domain matches any of the ignored patterns"""
        patterns = self.config.get('ignored_domains', [])
        return any(fnmatch.fnmatch(domain.lower(), pattern.lower()) for pattern in patterns)

    def add_ignored_domain(self, pattern: str):
        """Add a new domain pattern to the ignored list"""
        if 'ignored_domains' not in self.config:
            self.config['ignored_domains'] = []
        if pattern not in self.config['ignored_domains']:
            self.config['ignored_domains'].append(pattern)
            self.save_config()

    def remove_ignored_domain(self, pattern: str):
        """Remove a domain pattern from the ignored list"""
        if 'ignored_domains' in self.config:
            self.config['ignored_domains'] = [
                p for p in self.config['ignored_domains'] if p != pattern
            ]
            self.save_config()

class ReaderConfig:
    def __init__(self):
        self.excluded_patterns: Set[str] = set()

13 app/config.yaml Normal file
@@ -0,0 +1,13 @@
# Domains that should be ignored by the history tracker
# Supports wildcards (*) for pattern matching
ignored_domains:
  - "192.168.*"            # Ignore local network addresses
  - "127.0.0.1"            # Ignore localhost IP addresses
  - "localhost"            # Ignore localhost domains
  - "172.*"
  - "localhost:*"          # Ignore all localhost ports
  - "127.0.0.1:*"          # Ignore all localhost IP ports
  - "*.local"              # Ignore .local domains
  - "about:*"              # Ignore about: URLs
  - "chrome-extension://*" # Ignore Chrome extensions
  - "chrome://*"           # Ignore Chrome URLs

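Since these patterns are plain shell-style wildcards, the behaviour of Config.is_domain_ignored can be checked in isolation with fnmatch. A minimal sketch (example.com is just an illustrative non-matching domain):

import fnmatch

ignored_domains = ["192.168.*", "localhost:*", "*.local", "chrome-extension://*"]

def is_ignored(domain: str) -> bool:
    # Case-insensitive wildcard match, mirroring Config.is_domain_ignored
    return any(fnmatch.fnmatch(domain.lower(), p.lower()) for p in ignored_domains)

print(is_ignored("192.168.1.10"))    # True
print(is_ignored("localhost:8523"))  # True
print(is_ignored("printer.local"))   # True
print(is_ignored("example.com"))     # False
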
147 app/database.py
@@ -1,70 +1,143 @@
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, event
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from datetime import datetime
import sqlite3

SQLALCHEMY_DATABASE_URL = "sqlite:///./browser_history.db"

engine = create_engine(SQLALCHEMY_DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Create engine with custom configuration
engine = create_engine(
    SQLALCHEMY_DATABASE_URL,
    connect_args={
        "timeout": 30,  # Connection timeout in seconds
        "check_same_thread": False,  # Allow multi-threaded access
    },
    # Enable write-ahead logging and set a larger pool size
    pool_size=1,  # Single connection pool since we're using one connection
    max_overflow=0,  # Prevent additional connections
    pool_recycle=3600,  # Recycle connection every hour
)

SessionLocal = sessionmaker(
    autocommit=False,
    autoflush=False,
    bind=engine,
    expire_on_commit=False  # Prevent unnecessary reloads
)

Base = declarative_base()

@event.listens_for(engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Configure SQLite for better performance"""
    if isinstance(dbapi_connection, sqlite3.Connection):
        cursor = dbapi_connection.cursor()

        # Enable WAL mode for better write performance and concurrency
        cursor.execute("PRAGMA journal_mode=WAL")

        # Set page size to 4KB for better performance
        cursor.execute("PRAGMA page_size=4096")

        # Set cache size to ~32MB (a negative cache_size is in KiB, so -32000 is about 32MB)
        cursor.execute("PRAGMA cache_size=-32000")

        # Enable memory-mapped I/O for better performance
        cursor.execute("PRAGMA mmap_size=268435456")  # 256MB

        # Set synchronous mode to NORMAL for better write performance
        cursor.execute("PRAGMA synchronous=NORMAL")

        # Enable foreign key support
        cursor.execute("PRAGMA foreign_keys=ON")

        cursor.close()

class HistoryEntry(Base):
    __tablename__ = "history"

    id = Column(Integer, primary_key=True)
    url = Column(String)
    url = Column(String, index=True)  # Add index for URL lookups
    title = Column(String)
    visit_time = Column(DateTime)
    domain = Column(String)
    visit_time = Column(DateTime, index=True)  # Add index for time-based queries
    domain = Column(String, index=True)  # Add index for domain filtering
    markdown_content = Column(Text, nullable=True)
    last_content_update = Column(DateTime, nullable=True)

    __table_args__ = (
        # Composite index for common query patterns
        {'sqlite_with_rowid': True}  # Ensure we have rowids for better performance
    )

class Bookmark(Base):
    __tablename__ = "bookmarks"

    id = Column(Integer, primary_key=True, index=True)
    id = Column(Integer, primary_key=True)
    url = Column(String, index=True)
    title = Column(String, nullable=True)
    added_time = Column(DateTime, index=True)
    folder = Column(String, index=True)
    domain = Column(String, index=True)

class BlacklistedDomain(Base):
    __tablename__ = "blacklisted_domains"

    id = Column(Integer, primary_key=True)
    domain = Column(String, unique=True, index=True)
    reason = Column(String, nullable=True)
    added_time = Column(DateTime, default=datetime.utcnow)

    @classmethod
    def is_blacklisted(cls, db: SessionLocal, domain: str) -> bool:
        """Check if a domain is blacklisted"""
        return db.query(cls).filter(cls.domain == domain.lower()).first() is not None

    @classmethod
    def add_to_blacklist(cls, db: SessionLocal, domain: str, reason: str = None):
        """Add a domain to the blacklist"""
        try:
            blacklist_entry = cls(
                domain=domain.lower(),
                reason=reason
            )
            db.add(blacklist_entry)
            db.commit()
        except:
            db.rollback()
            # If entry already exists, just update the reason
            existing = db.query(cls).filter(cls.domain == domain.lower()).first()
            if existing and reason:
                existing.reason = reason
                db.commit()
    __table_args__ = (
        # Composite index for common query patterns
        {'sqlite_with_rowid': True}  # Ensure we have rowids for better performance
    )

# Create tables
Base.metadata.create_all(bind=engine)

# Initialize FTS tables for full-text search
def init_fts():
    """Initialize Full Text Search tables"""
    conn = engine.raw_connection()
    cursor = conn.cursor()

    # Create FTS table for history content
    cursor.execute("""
        CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5(
            title,
            markdown_content,
            content='history',
            content_rowid='id',
            tokenize='porter unicode61'
        )
    """)

    # Create triggers to keep FTS index up to date
    cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN
            INSERT INTO history_fts(rowid, title, markdown_content)
            VALUES (new.id, new.title, new.markdown_content);
        END;
    """)

    cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS history_ad AFTER DELETE ON history BEGIN
            INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
            VALUES('delete', old.id, old.title, old.markdown_content);
        END;
    """)

    cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS history_au AFTER UPDATE ON history BEGIN
            INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
            VALUES('delete', old.id, old.title, old.markdown_content);
            INSERT INTO history_fts(rowid, title, markdown_content)
            VALUES (new.id, new.title, new.markdown_content);
        END;
    """)

    conn.commit()
    cursor.close()
    conn.close()

# Initialize FTS tables
init_fts()

def get_db():
    """Get database session"""
    db = SessionLocal()
    try:
        yield db

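history_fts is an external-content FTS5 table (content='history'), which is why the delete and update triggers insert a row whose first value is the string 'delete': that is FTS5's convention for removing a row from an external-content index. Once populated, the index can be queried directly; a minimal sqlite3 sketch, assuming it is run from the project root and using an illustrative search term:

import sqlite3

conn = sqlite3.connect("browser_history.db")
rows = conn.execute(
    """
    SELECT h.url, h.title
    FROM history h
    JOIN history_fts f ON h.id = f.rowid
    WHERE history_fts MATCH ?   -- porter-stemmed full-text query
    ORDER BY rank               -- best matches first
    LIMIT 10
    """,
    ("python sqlite",),
).fetchall()
conn.close()
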
52 app/logging_config.py Normal file
@@ -0,0 +1,52 @@
import logging
import logging.handlers
import os
from datetime import datetime
from pathlib import Path

# Create logs directory if it doesn't exist
LOGS_DIR = Path("logs")
LOGS_DIR.mkdir(exist_ok=True)

# Create formatters
CONSOLE_FORMAT = '%(levelname)s: %(message)s'
FILE_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

def setup_logger(name: str) -> logging.Logger:
    """
    Set up a logger with both file and console handlers

    Args:
        name: The name of the logger (usually __name__)

    Returns:
        logging.Logger: Configured logger instance
    """
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    # Prevent adding handlers multiple times
    if logger.handlers:
        return logger

    # Console handler
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.WARNING)
    console_handler.setFormatter(logging.Formatter(CONSOLE_FORMAT))

    # File handler
    log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m')}.log"
    file_handler = logging.handlers.RotatingFileHandler(
        log_file,
        maxBytes=10*1024*1024,  # 10MB
        backupCount=5,
        encoding='utf-8'
    )
    file_handler.setLevel(logging.INFO)
    file_handler.setFormatter(logging.Formatter(FILE_FORMAT))

    # Add handlers
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger

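setup_logger gives each module a monthly, size-rotated log file under logs/ (10MB per file, 5 backups) plus a console handler that only surfaces warnings. A minimal usage sketch, assuming the package is importable as app:

from app.logging_config import setup_logger

logger = setup_logger(__name__)
logger.info("recorded in logs/YYYY-MM.log only")           # below the console handler's WARNING level
logger.warning("recorded in the log file and the console")
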
305 app/main.py
@@ -1,6 +1,6 @@
from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect
from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException
from sqlalchemy.orm import Session
from datetime import datetime, timezone
from datetime import datetime, timezone, timedelta
from typing import List, Optional
import asyncio
from fastapi import WebSocketDisconnect
@@ -8,14 +8,22 @@ from urllib.parse import urlparse
import pytz
from fastapi.middleware.cors import CORSMiddleware
import iso8601
from bs4 import BeautifulSoup
from sqlalchemy import text
from sqlalchemy.sql import text
from .logging_config import setup_logger

from .database import get_db, HistoryEntry, Bookmark
from .scheduler import HistoryScheduler
from .page_info import PageInfo
from .page_reader import PageReader
from .config import Config

logger = setup_logger(__name__)

app = FastAPI()
scheduler = HistoryScheduler()
config = Config()

# Add CORS middleware to allow WebSocket connections
app.add_middleware(
@@ -28,6 +36,7 @@ app.add_middleware(

@app.on_event("startup")
async def startup_event():
    logger.info("Starting application")
    # Initial bookmark fetch
    await scheduler.update_bookmarks()
    # Start the background task
@@ -35,13 +44,24 @@ async def startup_event():

def serialize_history_entry(entry, include_content: bool = False):
    """Serialize a HistoryEntry object to a dictionary"""
    result = {
        "id": entry.id,
        "url": entry.url,
        "title": entry.title,
        "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
        "domain": entry.domain,
    }
    # Handle both ORM objects and raw SQL results
    if hasattr(entry, '_mapping'):  # Raw SQL result
        result = {
            "id": entry.id,
            "url": entry.url,
            "title": entry.title,
            "visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
            "domain": entry.domain,
        }
    else:  # ORM object
        result = {
            "id": entry.id,
            "url": entry.url,
            "title": entry.title,
            "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
            "domain": entry.domain,
        }

    if include_content:
        result["markdown_content"] = entry.markdown_content
    return result
@@ -66,25 +86,54 @@ async def search_history(
    include_content: bool = Query(False),
    db: Session = Depends(get_db)
):
    query = db.query(HistoryEntry)
    """Search history with optimized full-text search"""
    try:
        # If there's a full-text search term, use the FTS table
        if search_term:
            # Use raw SQL for FTS query to leverage SQLite's optimization
            fts_query = """
                SELECT h.* FROM history h
                INNER JOIN history_fts f ON h.id = f.rowid
                WHERE history_fts MATCH :search
                AND (:domain IS NULL OR h.domain = :domain)
                AND (:start_date IS NULL OR h.visit_time >= :start_date)
                AND (:end_date IS NULL OR h.visit_time <= :end_date)
                ORDER BY rank
                LIMIT 1000
            """
            results = db.execute(
                text(fts_query),
                {
                    'search': search_term,
                    'domain': domain,
                    'start_date': start_date,
                    'end_date': end_date
                }
            ).all()

    if domain:
        query = query.filter(HistoryEntry.domain == domain)
            # Return serialized results directly
            return [serialize_history_entry(row, include_content) for row in results]
        else:
            # Start with base query
            query = db.query(HistoryEntry)

    if start_date:
        query = query.filter(HistoryEntry.visit_time >= start_date)
            # Apply filters
            if domain:
                query = query.filter(HistoryEntry.domain == domain)

    if end_date:
        query = query.filter(HistoryEntry.visit_time <= end_date)
            if start_date:
                query = query.filter(HistoryEntry.visit_time >= start_date)

    if search_term:
        query = query.filter(
            (HistoryEntry.title.ilike(f"%{search_term}%")) |
            (HistoryEntry.markdown_content.ilike(f"%{search_term}%"))
        )
            if end_date:
                query = query.filter(HistoryEntry.visit_time <= end_date)

    entries = query.all()
    return [serialize_history_entry(entry, include_content) for entry in entries]
            # Execute query with limit for better performance
            entries = query.limit(1000).all()
            return [serialize_history_entry(entry, include_content) for entry in entries]

    except Exception as e:
        print(f"Search error: {e}")
        raise HTTPException(status_code=500, detail="Search operation failed")

@app.get("/bookmarks/search")
async def search_bookmarks(
@@ -93,84 +142,204 @@ async def search_bookmarks(
    search_term: Optional[str] = Query(None),
    db: Session = Depends(get_db)
):
    query = db.query(Bookmark)
    """Search bookmarks with optimized queries"""
    try:
        # Build query efficiently
        query = db.query(Bookmark)

    if domain:
        query = query.filter(Bookmark.domain == domain)
        # Apply filters using index-optimized queries
        if domain:
            query = query.filter(Bookmark.domain == domain)

    if folder:
        query = query.filter(Bookmark.folder == folder)
        if folder:
            query = query.filter(Bookmark.folder == folder)

    if search_term:
        query = query.filter(Bookmark.title.ilike(f"%{search_term}%"))
        if search_term:
            # Use LIKE with index hint for title search
            search_pattern = f"%{search_term}%"
            query = query.filter(
                Bookmark.title.ilike(search_pattern)
            ).with_hint(
                Bookmark,
                'INDEXED BY ix_bookmarks_title',
                'sqlite'
            )

    bookmarks = query.all()
    return [serialize_bookmark(bookmark) for bookmark in bookmarks]
        # Add ordering and limit for better performance
        bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()

        return [serialize_bookmark(bookmark) for bookmark in bookmarks]

    except Exception as e:
        print(f"Bookmark search error: {e}")
        raise HTTPException(status_code=500, detail="Search operation failed")

# Add new endpoint for advanced full-text search
@app.get("/history/search/advanced")
async def advanced_history_search(
    query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
    include_content: bool = Query(False),
    db: Session = Depends(get_db)
):
    """Advanced full-text search using SQLite FTS5 features"""
    try:
        # Use raw SQL for advanced FTS query
        fts_query = """
            SELECT h.*, rank
            FROM history h
            INNER JOIN history_fts f ON h.id = f.rowid
            WHERE history_fts MATCH :query
            ORDER BY rank
            LIMIT 1000
        """

        results = db.execute(text(fts_query), {'query': query}).all()

        # Convert results to HistoryEntry objects
        entries = [
            serialize_history_entry(
                HistoryEntry(
                    id=row.id,
                    url=row.url,
                    title=row.title,
                    visit_time=row.visit_time,
                    domain=row.domain,
                    markdown_content=row.markdown_content if include_content else None
                ),
                include_content
            )
            for row in results
        ]

        return entries

    except Exception as e:
        print(f"Advanced search error: {e}")
        raise HTTPException(status_code=500, detail="Advanced search operation failed")

@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
    print("WebSocket endpoint called")
    logger.info("New WebSocket connection established")
    page_reader = PageReader()
    print("New WebSocket connection established")
    await websocket.accept()
    print("WebSocket connection accepted")
    try:
        while True:
            print("Waiting for message...")
            data = await websocket.receive_json()
            print(f"Received message for URL: {data['url']}")
            print(f"HTML content length: {len(data['html'])}")
            print(f"Timestamp: {data['timestamp']}")

            # Parse the ISO timestamp correctly
            # Parse the URL and check if domain should be ignored
            domain = urlparse(data['url']).netloc
            if config.is_domain_ignored(domain):
                logger.info(f"Ignoring domain: {domain}")
                await websocket.send_json({
                    "status": "ignored",
                    "message": f"Domain {domain} is in ignore list"
                })
                continue

            logger.info(f"Processing page: {data['url']}")
            timestamp = iso8601.parse_date(data['timestamp'])

            # Check if we already have a recent entry for this URL
            existing_entry = db.query(HistoryEntry).filter(
                HistoryEntry.url == data['url'],
                HistoryEntry.visit_time >= timestamp - timedelta(minutes=5)
            ).first()

            if existing_entry:
                print(f"Recent entry exists for URL: {data['url']}")
                await websocket.send_json({
                    "status": "skipped",
                    "message": "Recent entry exists"
                })
                continue

            page_info = PageInfo(
                url=data['url'],
                html=data['html'],
                timestamp=timestamp
            )
            print(f"Created PageInfo object for: {page_info.url}")

            # Convert HTML to markdown
            print("Converting HTML to markdown...")
            # Debug HTML content
            print(f"HTML content length before processing: {len(page_info.html)}")

            # Extract title
            soup = BeautifulSoup(page_info.html, 'html.parser')
            title = soup.title.string if soup.title else ''
            print(f"Extracted title: {title}")

            # Debug markdown conversion
            print("Starting markdown conversion...")
            cleaned_html = page_reader.clean_html(page_info.html)
            print(f"Cleaned HTML length: {len(cleaned_html)}")

            markdown_content = page_reader.html_to_markdown(page_info.html)
            print(f"Markdown conversion complete, length: {len(markdown_content) if markdown_content else 0}")
            print(f"Markdown conversion complete. Content length: {len(markdown_content) if markdown_content else 0}")

            # Update or create history entry
            domain = urlparse(page_info.url).netloc
            print(f"Creating history entry for domain: {domain}")
            if markdown_content:
                print("First 100 chars of markdown:", markdown_content[:100])
            else:
                print("No markdown content generated")

            if not title and not markdown_content:
                print(f"No content extracted from: {page_info.url}")
                await websocket.send_json({
                    "status": "skipped",
                    "message": "No content extracted"
                })
                continue

            # Create history entry
            history_entry = HistoryEntry(
                url=page_info.url,
                title=title,
                visit_time=page_info.timestamp,
                domain=domain,
                markdown_content=markdown_content,
                last_content_update=datetime.now(timezone.utc)
            )

            print("Saving to database...")
            db.add(history_entry)
            db.commit()
            print("Database save complete")
            # Debug database operation
            print(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}")

            # Send confirmation back to client
            await websocket.send_json({
                "status": "success",
                "message": f"Processed page: {page_info.url}"
            })
            # Use bulk operations for better performance
            db.add(history_entry)

            try:
                db.commit()
                print(f"Successfully saved entry for: {page_info.url}")
                print(f"Verify markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}")
                await websocket.send_json({
                    "status": "success",
                    "message": f"Processed page: {page_info.url}"
                })
            except Exception as e:
                db.rollback()
                print(f"Error saving entry: {e}")
                await websocket.send_json({
                    "status": "error",
                    "message": "Database error"
                })

    except WebSocketDisconnect:
        print("Client disconnected")
        logger.info("Client disconnected")
    except Exception as e:
        print(f"Error handling message: {e}")
        # Send error back to client if possible
        try:
            await websocket.send_json({
                "status": "error",
                "message": str(e)
            })
        except:
            pass
        logger.error("Error in WebSocket handler", exc_info=True)
    finally:
        print("Cleaning up resources")
        page_reader.close()
        await page_reader.close()

@app.get("/config/ignored-domains")
|
||||
async def get_ignored_domains():
|
||||
"""Get list of ignored domain patterns"""
|
||||
return {"ignored_domains": config.config.get('ignored_domains', [])}
|
||||
|
||||
@app.post("/config/ignored-domains")
|
||||
async def add_ignored_domain(pattern: str):
|
||||
"""Add a new domain pattern to ignored list"""
|
||||
config.add_ignored_domain(pattern)
|
||||
return {"status": "success", "message": f"Added pattern: {pattern}"}
|
||||
|
||||
@app.delete("/config/ignored-domains/{pattern}")
|
||||
async def remove_ignored_domain(pattern: str):
|
||||
"""Remove a domain pattern from ignored list"""
|
||||
config.remove_ignored_domain(pattern)
|
||||
return {"status": "success", "message": f"Removed pattern: {pattern}"}
|
||||
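For reference, the /ws endpoint consumes one JSON object per page, carrying url, html and an ISO 8601 timestamp, and answers each message with a status object ("success", "skipped", "ignored" or "error"). A minimal client sketch using the websockets package pinned in requirements.txt; the URL and HTML payload are made-up examples:

import asyncio
import json
from datetime import datetime, timezone

import websockets

async def send_page():
    async with websockets.connect("ws://localhost:8523/ws") as ws:
        await ws.send(json.dumps({
            "url": "https://example.com/",
            "html": "<html><title>Example</title><body>Hello</body></html>",
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }))
        print(await ws.recv())  # e.g. {"status": "success", "message": "Processed page: ..."}

asyncio.run(send_page())
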
@@ -4,15 +4,11 @@ from bs4 import BeautifulSoup
from typing import Optional
from urllib.parse import urlparse
from .config import ReaderConfig
import logging
from .database import SessionLocal, BlacklistedDomain
from .logging_config import setup_logger
from .database import SessionLocal

# Setup logging with less verbose output
logging.basicConfig(
    level=logging.WARNING,
    format='%(levelname)s: %(message)s'
)
logger = logging.getLogger(__name__)
# Setup logger for this module
logger = setup_logger(__name__)

# Patterns for cleaning
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
@@ -26,13 +22,15 @@ SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
class PageReader:
    def __init__(self):
        self.config = ReaderConfig()
        self.db = SessionLocal()
        logger.info("PageReader initialized")

    def clean_html(self, html: str) -> str:
        """Clean HTML by removing unwanted elements and patterns."""
        if not html:
            logger.warning("Received empty HTML to clean")
            return ""

        logger.debug(f"Cleaning HTML of length: {len(html)}")
        # First use regex to remove problematic patterns
        html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
        html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
@@ -54,12 +52,15 @@ class PageReader:
            ]

            for element in elements_to_remove:
                removed = len(soup.find_all(element))
                if removed:
                    logger.debug(f"Removed {removed} {element} elements")
                for tag in soup.find_all(element):
                    tag.decompose()

            return str(soup)
        except Exception as e:
            logger.error(f"Error cleaning HTML: {e}")
            logger.error(f"Error cleaning HTML: {e}", exc_info=True)
            return ""

    def clean_whitespace(self, text: str) -> str:
@@ -80,11 +81,17 @@ class PageReader:
    def html_to_markdown(self, html: str) -> Optional[str]:
        """Convert HTML to markdown."""
        try:
            logger.info("Starting HTML to Markdown conversion")
            logger.debug(f"Input HTML length: {len(html)}")

            cleaned_html = self.clean_html(html)
            logger.debug(f"Cleaned HTML length: {len(cleaned_html)}")

            if not cleaned_html:
                logger.warning("No cleaned HTML content")
                return None

            return self.clean_whitespace(md(cleaned_html,
            markdown = self.clean_whitespace(md(cleaned_html,
                heading_style="ATX",
                bullets="-",
                autolinks=True,
@@ -92,10 +99,19 @@ class PageReader:
                escape_asterisks=True,
                escape_underscores=True))

            logger.debug(f"Generated markdown length: {len(markdown) if markdown else 0}")

            if not markdown or markdown.isspace():
                logger.warning("Markdown is empty or whitespace only")
                return None

            return markdown

        except Exception as e:
            logger.error(f"Error converting to markdown: {e}")
            logger.error("Error converting to markdown", exc_info=True)
            return None

    def close(self):
    async def close(self):
        """Cleanup resources"""
        self.db.close()
        logger.info("Closing PageReader")
        pass  # No need to close DB connection anymore

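After this change PageReader no longer owns a database session; it is a plain HTML-to-Markdown helper and close() is an async no-op. A minimal usage sketch, assuming the package is importable as app:

from app.page_reader import PageReader

reader = PageReader()
markdown = reader.html_to_markdown("<html><body><h1>Title</h1><p>Some text.</p></body></html>")
print(markdown)  # roughly "# Title" plus the paragraph, or None if nothing survives cleaning
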
@@ -7,6 +7,9 @@ from .page_reader import PageReader
from sqlalchemy import func
from sqlalchemy.orm import Session
import pytz
from .config import Config
from .database import get_db
from urllib.parse import urlparse

class HistoryScheduler:
    def __init__(self):
@@ -14,6 +17,7 @@ class HistoryScheduler:
        self.page_reader = PageReader()
        self.last_history_update = None
        self.content_update_interval = timedelta(hours=24)  # Update content daily
        self.config = Config()

    def _normalize_datetime(self, dt: datetime) -> datetime:
        """Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't"""
@@ -28,81 +32,70 @@ class HistoryScheduler:
        return dt.astimezone(pytz.UTC)

    async def update_bookmarks(self):
        bookmarks = self.browser_collector.fetch_bookmarks()

        db = SessionLocal()
        """Update bookmarks from browser"""
        try:
            # First, get all existing URLs to avoid duplicates
            existing_urls = {
                url: (added_time, folder)
                for url, added_time, folder in
                db.query(Bookmark.url, Bookmark.added_time, Bookmark.folder).all()
            }
            db = next(get_db())
            bookmarks = self.browser_collector.fetch_bookmarks()

            for added_time, url, title, folder in bookmarks:  # Unpack the tuple
                # Extract domain and check if it should be ignored
                domain = urlparse(url).netloc
                if self.config.is_domain_ignored(domain):
                    continue

            new_entries = []
            for added_time, url, title, folder in bookmarks:
                # Normalize the datetime
                added_time = self._normalize_datetime(added_time)

                # Only add if URL doesn't exist or if it's in a different folder
                if (url not in existing_urls or
                        existing_urls[url][1] != folder):
                    domain = self.browser_collector.get_domain(url)
                    entry = Bookmark(
                        url=url,
                        title=title,
                        added_time=added_time,
                        folder=folder,
                        domain=domain
                    )
                    new_entries.append(entry)
                # Process the bookmark only if domain is not ignored
                bookmark_entry = Bookmark(
                    url=url,
                    title=title,
                    added_time=added_time,
                    folder=folder,
                    domain=domain
                )
                db.add(bookmark_entry)

            if new_entries:
                db.bulk_save_objects(new_entries)
                db.commit()
            db.commit()

        except Exception as e:
            print(f"Error updating bookmarks: {e}")
        finally:
            db.close()

    async def update_history(self):
        """Background task to update history periodically"""
        while True:
            db = SessionLocal()
            try:
                # Get the latest timestamp from our database
                latest_entry = db.query(func.max(HistoryEntry.visit_time)).scalar()
                if latest_entry:
                    latest_entry = self._normalize_datetime(latest_entry)
                db = next(get_db())
                history_entries = self.browser_collector.fetch_history()

                # Fetch new history
                history = self.browser_collector.fetch_history()
                for visit_time, url, title in history_entries:  # Unpack the tuple
                    # Extract domain and check if it should be ignored
                    domain = urlparse(url).netloc
                    if self.config.is_domain_ignored(domain):
                        continue

                    # Filter to only get entries newer than our latest entry
                    new_entries = []
                    for visit_time, url, title in history:
                    # Normalize the datetime
                    visit_time = self._normalize_datetime(visit_time)

                    if not latest_entry or visit_time > latest_entry:
                        domain = self.browser_collector.get_domain(url)
                        entry = HistoryEntry(
                            url=url,
                            title=title,
                            visit_time=visit_time,
                            domain=domain
                        )
                        new_entries.append(entry)
                    # Process the entry only if domain is not ignored
                    history_entry = HistoryEntry(
                        url=url,
                        title=title,
                        visit_time=visit_time,
                        domain=domain
                    )
                    db.add(history_entry)

                if new_entries:
                    db.bulk_save_objects(new_entries)
                    db.commit()

                # Update bookmarks
                await self.update_bookmarks()
                db.commit()

            except Exception as e:
                print(f"Error updating history: {e}")
            finally:
                db.close()

            # Wait for 5 minutes before next update
            await asyncio.sleep(300)
            await asyncio.sleep(300)  # Wait 5 minutes before next update

    async def close(self):
        """Cleanup resources"""

@@ -1,5 +1,82 @@
console.log("Background script loaded");

class WebSocketClient {
  constructor() {
    console.log("WebSocketClient constructor called");
    this.messageQueue = [];
    this.connect();
    this.reconnectAttempts = 0;
    this.maxReconnectAttempts = 5;
  }

  connect() {
    console.log('Attempting to connect to WebSocket server...');
    try {
      this.ws = new WebSocket('ws://localhost:8523/ws');
      console.log('WebSocket instance created');

      this.ws.addEventListener('open', () => {
        console.log('WebSocket connection opened successfully');
        this.reconnectAttempts = 0;
        this.processQueue();
      });

      this.ws.addEventListener('error', (event) => {
        console.error('WebSocket error occurred:', event);
      });

      this.ws.addEventListener('close', (event) => {
        console.log('WebSocket connection closed:', event.code, event.reason);
        this.tryReconnect();
      });

      this.ws.addEventListener('message', (event) => {
        console.log('Received message from server:', event.data);
      });
    } catch (error) {
      console.error('Error creating WebSocket:', error);
    }
  }

  processQueue() {
    console.log(`Processing message queue (${this.messageQueue.length} messages)`);
    while (this.messageQueue.length > 0) {
      const data = this.messageQueue.shift();
      this.sendMessage(data);
    }
  }

  tryReconnect() {
    if (this.reconnectAttempts < this.maxReconnectAttempts) {
      this.reconnectAttempts++;
      console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
      setTimeout(() => this.connect(), 2000 * this.reconnectAttempts);
    } else {
      console.log('Max reconnection attempts reached');
    }
  }

  sendMessage(data) {
    if (this.ws.readyState === WebSocket.OPEN) {
      try {
        console.log('Sending data for URL:', data.url);
        this.ws.send(JSON.stringify(data));
        console.log('Data sent successfully');
        return true;
      } catch (error) {
        console.error('Error sending data:', error);
        return false;
      }
    } else {
      console.log('WebSocket not ready, queueing message');
      this.messageQueue.push(data);
      return true;
    }
  }
}

const wsClient = new WebSocketClient();

async function isContentScriptReady(tabId) {
  try {
    await browser.tabs.sendMessage(tabId, { type: "PING" });
@@ -38,9 +115,17 @@ async function sendMessageToTab(tabId) {
  }
}

// Listen for messages from content scripts
browser.runtime.onMessage.addListener((message, sender) => {
  if (message.type === "SEND_PAGE_CONTENT") {
    console.log('Received page content from tab:', sender.tab.id);
    wsClient.sendMessage(message.data);
  }
});

browser.webNavigation.onCompleted.addListener(async (details) => {
  console.log("Navigation completed", details);
  if (details.frameId === 0) { // Only handle main frame navigation
  if (details.frameId === 0) {
    console.log(`Main frame navigation detected for tab ${details.tabId}`);
    await sendMessageToTab(details.tabId);
  }

@@ -1,132 +1,32 @@
console.log("Content script starting initialization...");

// Function to log WebSocket state
function getWebSocketState(ws) {
  const states = {
    0: 'CONNECTING',
    1: 'OPEN',
    2: 'CLOSING',
    3: 'CLOSED'
function sendPageContent() {
  const pageContent = {
    url: window.location.href,
    html: document.documentElement.outerHTML,
    timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
  };
  return states[ws.readyState] || 'UNKNOWN';

  browser.runtime.sendMessage({
    type: "SEND_PAGE_CONTENT",
    data: pageContent
  });
}

class WebSocketClient {
  constructor() {
    console.log("WebSocketClient constructor called");
    this.messageQueue = [];
    this.connect();
    this.reconnectAttempts = 0;
    this.maxReconnectAttempts = 5;
  }

  connect() {
    console.log('Attempting to connect to WebSocket server...');
    try {
      this.ws = new WebSocket('ws://localhost:8523/ws');
      console.log('WebSocket instance created');

      this.ws.addEventListener('open', () => {
        console.log('WebSocket connection opened successfully');
        this.reconnectAttempts = 0;
        // Process any queued messages
        this.processQueue();
      });

      this.ws.addEventListener('error', (event) => {
        console.error('WebSocket error occurred:', event);
      });

      this.ws.addEventListener('close', (event) => {
        console.log('WebSocket connection closed:', event.code, event.reason);
        this.tryReconnect();
      });

      this.ws.addEventListener('message', (event) => {
        console.log('Received message from server:', event.data);
      });
    } catch (error) {
      console.error('Error creating WebSocket:', error);
    }
  }

  processQueue() {
    console.log(`Processing message queue (${this.messageQueue.length} messages)`);
    while (this.messageQueue.length > 0) {
      const data = this.messageQueue.shift();
      this.sendMessage(data);
    }
  }

  tryReconnect() {
    if (this.reconnectAttempts < this.maxReconnectAttempts) {
      this.reconnectAttempts++;
      console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
      setTimeout(() => this.connect(), 2000 * this.reconnectAttempts);
    } else {
      console.log('Max reconnection attempts reached');
    }
  }

  sendMessage(data) {
    console.log('sendMessage called, WebSocket state:', getWebSocketState(this.ws));
    if (this.ws.readyState === WebSocket.OPEN) {
      try {
        console.log('Preparing to send data:', {
          url: data.url,
          timestamp: data.timestamp,
          htmlLength: data.html.length
        });
        this.ws.send(JSON.stringify(data));
        console.log('Data sent successfully');
        return true;
      } catch (error) {
        console.error('Error sending data:', error);
        return false;
      }
    } else {
      console.log('WebSocket not ready, queueing message');
      this.messageQueue.push(data);
      return true;
    }
  }
}

console.log("Creating WebSocketClient instance...");
const wsClient = new WebSocketClient();

console.log("Setting up message listener...");
// Listen for messages from the background script
browser.runtime.onMessage.addListener((message, sender, sendResponse) => {
  console.log('Message received from background script:', message);

  if (message.type === "PING") {
    console.log('Received PING, responding...');
    return Promise.resolve({ status: "ready" });
  }

  if (message.type === "GET_PAGE_CONTENT") {
    console.log('Processing GET_PAGE_CONTENT message');
    const pageContent = {
      url: window.location.href,
      html: document.documentElement.outerHTML,
      timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
    };

    console.log('Created page content object for:', pageContent.url);
    wsClient.sendMessage(pageContent);
    sendPageContent();
  }

  return true;
});

// Send initial page content
console.log('Sending initial page content...');
const pageContent = {
  url: window.location.href,
  html: document.documentElement.outerHTML,
  timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
};

wsClient.sendMessage(pageContent);
sendPageContent();

console.log("Content script initialization complete for:", window.location.href);

@@ -1,84 +0,0 @@
import httpx
import re
from markdownify import markdownify as md
from bs4 import BeautifulSoup

# Patterns for cleaning
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
META_PATTERN = r"<[ ]*meta.*?>"
COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
LINK_PATTERN = r"<[ ]*link.*?>"
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"

def clean_html(html: str) -> str:
    """Clean HTML by removing unwanted elements and patterns."""
    # First use regex to remove problematic patterns
    html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
    html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
    html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
    html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
    html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
    html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
    html = re.sub(BASE64_IMG_PATTERN, "", html)

    # Use BeautifulSoup to remove additional elements we want to strip
    soup = BeautifulSoup(html, 'html.parser')

    # Remove unwanted elements
    elements_to_remove = [
        'canvas', 'img', 'picture', 'audio', 'video',
        'iframe', 'embed', 'object', 'param', 'track',
        'map', 'area', 'source'
    ]

    for element in elements_to_remove:
        for tag in soup.find_all(element):
            tag.decompose()

    return str(soup)

def get_page_html(url: str) -> str:
    """Fetch HTML content from a given URL using httpx."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    }
    try:
        with httpx.Client(follow_redirects=True) as client:
            response = client.get(url, headers=headers)
            response.raise_for_status()
            return response.text
    except httpx.HTTPError as e:
        print(f"Error fetching page: {e}")
        return ""

def clean_whitespace(text: str) -> str:
    """Clean excessive whitespace from text, collapsing more than 2 newlines."""
    # Replace 3 or more newlines with 2 newlines
    cleaned = re.sub(r'\n{3,}', '\n\n', text)
    # Remove trailing whitespace from each line
    cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
    return cleaned.strip()

def html_to_markdown(url: str) -> str:
    """Convert webpage HTML to markdown."""
    html = get_page_html(url)
    if not html:
        return ""

    # Clean the HTML first
    cleaned_html = clean_html(html)

    # Convert to markdown using markdownify
    # Configure markdownify options for clean output
    markdown = md(cleaned_html,
                  heading_style="ATX",  # Use # style headers
                  bullets="-",  # Use - for bullets
                  autolinks=True,  # Convert URLs to links
                  strip=['form'],  # Additional elements to strip
                  escape_asterisks=True,
                  escape_underscores=True)

    # Clean up excessive whitespace
    return clean_whitespace(markdown)

@@ -2,9 +2,10 @@ fastapi
uvicorn
sqlalchemy
browser-history
beautifulsoup4
beautifulsoup4>=4.9.3
markdownify
pyyaml
pyyaml>=6.0.1
pytz
websockets==11.0.3
iso8601==2.1.0
iso8601==2.1.0
lxml>=4.9.3