Update extension to use a single WebSocket connection, plus many other changes

commit 9378f77a61 (parent 7388ac18d4)
Date: 2025-01-25 23:28:32 -06:00

12 changed files with 634 additions and 374 deletions

.gitignore

@@ -1 +1,5 @@
 __pycache__/
+logs/
+*.db
+*.db-shm
+*.db-wal

app/config.py

@@ -3,6 +3,44 @@ from pathlib import Path
 from typing import Set
 import fnmatch
 
+class Config:
+    def __init__(self):
+        self.config_path = Path(__file__).parent / "config.yaml"
+        self.load_config()
+
+    def load_config(self):
+        if not self.config_path.exists():
+            self.config = {"ignored_domains": []}
+            self.save_config()
+        else:
+            with open(self.config_path, 'r') as f:
+                self.config = yaml.safe_load(f)
+
+    def save_config(self):
+        with open(self.config_path, 'w') as f:
+            yaml.dump(self.config, f)
+
+    def is_domain_ignored(self, domain: str) -> bool:
+        """Check if a domain matches any of the ignored patterns"""
+        patterns = self.config.get('ignored_domains', [])
+        return any(fnmatch.fnmatch(domain.lower(), pattern.lower()) for pattern in patterns)
+
+    def add_ignored_domain(self, pattern: str):
+        """Add a new domain pattern to the ignored list"""
+        if 'ignored_domains' not in self.config:
+            self.config['ignored_domains'] = []
+        if pattern not in self.config['ignored_domains']:
+            self.config['ignored_domains'].append(pattern)
+            self.save_config()
+
+    def remove_ignored_domain(self, pattern: str):
+        """Remove a domain pattern from the ignored list"""
+        if 'ignored_domains' in self.config:
+            self.config['ignored_domains'] = [
+                p for p in self.config['ignored_domains'] if p != pattern
+            ]
+            self.save_config()
+
 class ReaderConfig:
     def __init__(self):
         self.excluded_patterns: Set[str] = set()

app/config.yaml (new file)

@@ -0,0 +1,13 @@
+# Domains that should be ignored by the history tracker
+# Supports wildcards (*) for pattern matching
+ignored_domains:
+  - "192.168.*"             # Ignore local network addresses
+  - "127.0.0.1"             # Ignore localhost IP addresses
+  - "localhost"             # Ignore localhost domains
+  - "172.*"
+  - "localhost:*"           # Ignore all localhost ports
+  - "127.0.0.1:*"           # Ignore all localhost IP ports
+  - "*.local"               # Ignore .local domains
+  - "about:*"               # Ignore about: URLs
+  - "chrome-extension://*"  # Ignore Chrome extensions
+  - "chrome://*"            # Ignore Chrome URLs
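
Note: a small sketch (not part of this commit) of how Config.is_domain_ignored applies these patterns via fnmatch; the hostnames below are arbitrary examples:

    import fnmatch

    patterns = ["192.168.*", "localhost:*", "*.local", "chrome://*"]

    def is_ignored(netloc: str) -> bool:
        # Case-insensitive wildcard match, mirroring Config.is_domain_ignored
        return any(fnmatch.fnmatch(netloc.lower(), p.lower()) for p in patterns)

    print(is_ignored("192.168.1.10"))    # True  - matches "192.168.*"
    print(is_ignored("localhost:8523"))  # True  - matches "localhost:*"
    print(is_ignored("printer.local"))   # True  - matches "*.local"
    print(is_ignored("example.com"))     # False - no pattern matches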

app/database.py

@@ -1,70 +1,143 @@
-from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
+from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, event
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from datetime import datetime
+import sqlite3
 
 SQLALCHEMY_DATABASE_URL = "sqlite:///./browser_history.db"
 
-engine = create_engine(SQLALCHEMY_DATABASE_URL)
-SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+# Create engine with custom configuration
+engine = create_engine(
+    SQLALCHEMY_DATABASE_URL,
+    connect_args={
+        "timeout": 30,  # Connection timeout in seconds
+        "check_same_thread": False,  # Allow multi-threaded access
+    },
+    # Enable write-ahead logging and set a larger pool size
+    pool_size=1,  # Single connection pool since we're using one connection
+    max_overflow=0,  # Prevent additional connections
+    pool_recycle=3600,  # Recycle connection every hour
+)
+
+SessionLocal = sessionmaker(
+    autocommit=False,
+    autoflush=False,
+    bind=engine,
+    expire_on_commit=False  # Prevent unnecessary reloads
+)
+
 Base = declarative_base()
 
+@event.listens_for(engine, "connect")
+def set_sqlite_pragma(dbapi_connection, connection_record):
+    """Configure SQLite for better performance"""
+    if isinstance(dbapi_connection, sqlite3.Connection):
+        cursor = dbapi_connection.cursor()
+        # Enable WAL mode for better write performance and concurrency
+        cursor.execute("PRAGMA journal_mode=WAL")
+        # Set page size to 4KB for better performance
+        cursor.execute("PRAGMA page_size=4096")
+        # Set cache size to 32MB (-32000 pages * 4KB per page = ~32MB)
+        cursor.execute("PRAGMA cache_size=-32000")
+        # Enable memory-mapped I/O for better performance
+        cursor.execute("PRAGMA mmap_size=268435456")  # 256MB
+        # Set synchronous mode to NORMAL for better write performance
+        cursor.execute("PRAGMA synchronous=NORMAL")
+        # Enable foreign key support
+        cursor.execute("PRAGMA foreign_keys=ON")
+        cursor.close()
+
 class HistoryEntry(Base):
     __tablename__ = "history"
     id = Column(Integer, primary_key=True)
-    url = Column(String)
+    url = Column(String, index=True)  # Add index for URL lookups
     title = Column(String)
-    visit_time = Column(DateTime)
-    domain = Column(String)
+    visit_time = Column(DateTime, index=True)  # Add index for time-based queries
+    domain = Column(String, index=True)  # Add index for domain filtering
     markdown_content = Column(Text, nullable=True)
     last_content_update = Column(DateTime, nullable=True)
+    __table_args__ = (
+        # Composite index for common query patterns
+        {'sqlite_with_rowid': True}  # Ensure we have rowids for better performance
+    )
 
 class Bookmark(Base):
     __tablename__ = "bookmarks"
-    id = Column(Integer, primary_key=True, index=True)
+    id = Column(Integer, primary_key=True)
     url = Column(String, index=True)
     title = Column(String, nullable=True)
     added_time = Column(DateTime, index=True)
     folder = Column(String, index=True)
     domain = Column(String, index=True)
-
-class BlacklistedDomain(Base):
-    __tablename__ = "blacklisted_domains"
-
-    id = Column(Integer, primary_key=True)
-    domain = Column(String, unique=True, index=True)
-    reason = Column(String, nullable=True)
-    added_time = Column(DateTime, default=datetime.utcnow)
-
-    @classmethod
-    def is_blacklisted(cls, db: SessionLocal, domain: str) -> bool:
-        """Check if a domain is blacklisted"""
-        return db.query(cls).filter(cls.domain == domain.lower()).first() is not None
-
-    @classmethod
-    def add_to_blacklist(cls, db: SessionLocal, domain: str, reason: str = None):
-        """Add a domain to the blacklist"""
-        try:
-            blacklist_entry = cls(
-                domain=domain.lower(),
-                reason=reason
-            )
-            db.add(blacklist_entry)
-            db.commit()
-        except:
-            db.rollback()
-            # If entry already exists, just update the reason
-            existing = db.query(cls).filter(cls.domain == domain.lower()).first()
-            if existing and reason:
-                existing.reason = reason
-                db.commit()
+    __table_args__ = (
+        # Composite index for common query patterns
+        {'sqlite_with_rowid': True}  # Ensure we have rowids for better performance
+    )
 
+# Create tables
 Base.metadata.create_all(bind=engine)
 
+# Initialize FTS tables for full-text search
+def init_fts():
+    """Initialize Full Text Search tables"""
+    conn = engine.raw_connection()
+    cursor = conn.cursor()
+
+    # Create FTS table for history content
+    cursor.execute("""
+        CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5(
+            title,
+            markdown_content,
+            content='history',
+            content_rowid='id',
+            tokenize='porter unicode61'
+        )
+    """)
+
+    # Create triggers to keep FTS index up to date
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN
+            INSERT INTO history_fts(rowid, title, markdown_content)
+            VALUES (new.id, new.title, new.markdown_content);
+        END;
+    """)
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS history_ad AFTER DELETE ON history BEGIN
+            INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
+            VALUES('delete', old.id, old.title, old.markdown_content);
+        END;
+    """)
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS history_au AFTER UPDATE ON history BEGIN
+            INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
+            VALUES('delete', old.id, old.title, old.markdown_content);
+            INSERT INTO history_fts(rowid, title, markdown_content)
+            VALUES (new.id, new.title, new.markdown_content);
+        END;
+    """)
+
+    conn.commit()
+    cursor.close()
+    conn.close()
+
+# Initialize FTS tables
+init_fts()
+
 def get_db():
+    """Get database session"""
     db = SessionLocal()
     try:
         yield db
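
Note: a minimal sketch (not part of this commit) for confirming that the connect-event PRAGMAs above are applied to new connections; it assumes the package is importable as app.database:

    from sqlalchemy import text
    from app.database import engine

    with engine.connect() as conn:
        print(conn.execute(text("PRAGMA journal_mode")).scalar())  # expected: wal
        print(conn.execute(text("PRAGMA synchronous")).scalar())   # expected: 1 (NORMAL)
        print(conn.execute(text("PRAGMA cache_size")).scalar())    # expected: -32000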

app/logging_config.py (new file)

@@ -0,0 +1,52 @@
+import logging
+import logging.handlers
+import os
+from datetime import datetime
+from pathlib import Path
+
+# Create logs directory if it doesn't exist
+LOGS_DIR = Path("logs")
+LOGS_DIR.mkdir(exist_ok=True)
+
+# Create formatters
+CONSOLE_FORMAT = '%(levelname)s: %(message)s'
+FILE_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+def setup_logger(name: str) -> logging.Logger:
+    """
+    Set up a logger with both file and console handlers
+
+    Args:
+        name: The name of the logger (usually __name__)
+
+    Returns:
+        logging.Logger: Configured logger instance
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+
+    # Prevent adding handlers multiple times
+    if logger.handlers:
+        return logger
+
+    # Console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.WARNING)
+    console_handler.setFormatter(logging.Formatter(CONSOLE_FORMAT))
+
+    # File handler
+    log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m')}.log"
+    file_handler = logging.handlers.RotatingFileHandler(
+        log_file,
+        maxBytes=10*1024*1024,  # 10MB
+        backupCount=5,
+        encoding='utf-8'
+    )
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(logging.Formatter(FILE_FORMAT))
+
+    # Add handlers
+    logger.addHandler(console_handler)
+    logger.addHandler(file_handler)
+
+    return logger
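
Note: intended usage of the new helper (a sketch, not part of this commit); the console handler only emits WARNING and above, while INFO and above go to the monthly rotating file under logs/:

    from app.logging_config import setup_logger

    logger = setup_logger(__name__)
    logger.info("recorded in logs/<YYYY-MM>.log only")
    logger.warning("recorded in the log file and echoed to the console")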

app/main.py

@@ -1,6 +1,6 @@
-from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect
+from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException
 from sqlalchemy.orm import Session
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
 from typing import List, Optional
 import asyncio
 from fastapi import WebSocketDisconnect
@@ -8,14 +8,22 @@ from urllib.parse import urlparse
 import pytz
 from fastapi.middleware.cors import CORSMiddleware
 import iso8601
+from bs4 import BeautifulSoup
+from sqlalchemy import text
+from sqlalchemy.sql import text
+from .logging_config import setup_logger
 
 from .database import get_db, HistoryEntry, Bookmark
 from .scheduler import HistoryScheduler
 from .page_info import PageInfo
 from .page_reader import PageReader
+from .config import Config
+
+logger = setup_logger(__name__)
 
 app = FastAPI()
 scheduler = HistoryScheduler()
+config = Config()
 
 # Add CORS middleware to allow WebSocket connections
 app.add_middleware(
@@ -28,6 +36,7 @@ app.add_middleware(
 
 @app.on_event("startup")
 async def startup_event():
+    logger.info("Starting application")
     # Initial bookmark fetch
     await scheduler.update_bookmarks()
     # Start the background task
@@ -35,13 +44,24 @@
 def serialize_history_entry(entry, include_content: bool = False):
     """Serialize a HistoryEntry object to a dictionary"""
-    result = {
-        "id": entry.id,
-        "url": entry.url,
-        "title": entry.title,
-        "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
-        "domain": entry.domain,
-    }
+    # Handle both ORM objects and raw SQL results
+    if hasattr(entry, '_mapping'):  # Raw SQL result
+        result = {
+            "id": entry.id,
+            "url": entry.url,
+            "title": entry.title,
+            "visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
+            "domain": entry.domain,
+        }
+    else:  # ORM object
+        result = {
+            "id": entry.id,
+            "url": entry.url,
+            "title": entry.title,
+            "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
+            "domain": entry.domain,
+        }
     if include_content:
         result["markdown_content"] = entry.markdown_content
     return result
@@ -66,25 +86,54 @@
     include_content: bool = Query(False),
     db: Session = Depends(get_db)
 ):
-    query = db.query(HistoryEntry)
-
-    if domain:
-        query = query.filter(HistoryEntry.domain == domain)
-
-    if start_date:
-        query = query.filter(HistoryEntry.visit_time >= start_date)
-
-    if end_date:
-        query = query.filter(HistoryEntry.visit_time <= end_date)
-
-    if search_term:
-        query = query.filter(
-            (HistoryEntry.title.ilike(f"%{search_term}%")) |
-            (HistoryEntry.markdown_content.ilike(f"%{search_term}%"))
-        )
-
-    entries = query.all()
-    return [serialize_history_entry(entry, include_content) for entry in entries]
+    """Search history with optimized full-text search"""
+    try:
+        # If there's a full-text search term, use the FTS table
+        if search_term:
+            # Use raw SQL for FTS query to leverage SQLite's optimization
+            fts_query = """
+                SELECT h.* FROM history h
+                INNER JOIN history_fts f ON h.id = f.rowid
+                WHERE history_fts MATCH :search
+                AND (:domain IS NULL OR h.domain = :domain)
+                AND (:start_date IS NULL OR h.visit_time >= :start_date)
+                AND (:end_date IS NULL OR h.visit_time <= :end_date)
+                ORDER BY rank
+                LIMIT 1000
+            """
+            results = db.execute(
+                text(fts_query),
+                {
+                    'search': search_term,
+                    'domain': domain,
+                    'start_date': start_date,
+                    'end_date': end_date
+                }
+            ).all()
+
+            # Return serialized results directly
+            return [serialize_history_entry(row, include_content) for row in results]
+        else:
+            # Start with base query
+            query = db.query(HistoryEntry)
+
+            # Apply filters
+            if domain:
+                query = query.filter(HistoryEntry.domain == domain)
+            if start_date:
+                query = query.filter(HistoryEntry.visit_time >= start_date)
+            if end_date:
+                query = query.filter(HistoryEntry.visit_time <= end_date)
+
+            # Execute query with limit for better performance
+            entries = query.limit(1000).all()
+            return [serialize_history_entry(entry, include_content) for entry in entries]
+    except Exception as e:
+        print(f"Search error: {e}")
+        raise HTTPException(status_code=500, detail="Search operation failed")
 
 @app.get("/bookmarks/search")
 async def search_bookmarks(
@@ -93,84 +142,204 @@
     search_term: Optional[str] = Query(None),
     db: Session = Depends(get_db)
 ):
-    query = db.query(Bookmark)
-
-    if domain:
-        query = query.filter(Bookmark.domain == domain)
-
-    if folder:
-        query = query.filter(Bookmark.folder == folder)
-
-    if search_term:
-        query = query.filter(Bookmark.title.ilike(f"%{search_term}%"))
-
-    bookmarks = query.all()
-    return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+    """Search bookmarks with optimized queries"""
+    try:
+        # Build query efficiently
+        query = db.query(Bookmark)
+
+        # Apply filters using index-optimized queries
+        if domain:
+            query = query.filter(Bookmark.domain == domain)
+        if folder:
+            query = query.filter(Bookmark.folder == folder)
+        if search_term:
+            # Use LIKE with index hint for title search
+            search_pattern = f"%{search_term}%"
+            query = query.filter(
+                Bookmark.title.ilike(search_pattern)
+            ).with_hint(
+                Bookmark,
+                'INDEXED BY ix_bookmarks_title',
+                'sqlite'
+            )
+
+        # Add ordering and limit for better performance
+        bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
+        return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+    except Exception as e:
+        print(f"Bookmark search error: {e}")
+        raise HTTPException(status_code=500, detail="Search operation failed")
+
+# Add new endpoint for advanced full-text search
+@app.get("/history/search/advanced")
+async def advanced_history_search(
+    query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
+    include_content: bool = Query(False),
+    db: Session = Depends(get_db)
+):
+    """Advanced full-text search using SQLite FTS5 features"""
+    try:
+        # Use raw SQL for advanced FTS query
+        fts_query = """
+            SELECT h.*, rank
+            FROM history h
+            INNER JOIN history_fts f ON h.id = f.rowid
+            WHERE history_fts MATCH :query
+            ORDER BY rank
+            LIMIT 1000
+        """
+        results = db.execute(text(fts_query), {'query': query}).all()
+
+        # Convert results to HistoryEntry objects
+        entries = [
+            serialize_history_entry(
+                HistoryEntry(
+                    id=row.id,
+                    url=row.url,
+                    title=row.title,
+                    visit_time=row.visit_time,
+                    domain=row.domain,
+                    markdown_content=row.markdown_content if include_content else None
+                ),
+                include_content
+            )
+            for row in results
+        ]
+        return entries
+    except Exception as e:
+        print(f"Advanced search error: {e}")
+        raise HTTPException(status_code=500, detail="Advanced search operation failed")
 
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
-    print("WebSocket endpoint called")
+    logger.info("New WebSocket connection established")
     page_reader = PageReader()
-    print("New WebSocket connection established")
     await websocket.accept()
-    print("WebSocket connection accepted")
     try:
         while True:
-            print("Waiting for message...")
             data = await websocket.receive_json()
-            print(f"Received message for URL: {data['url']}")
-            print(f"HTML content length: {len(data['html'])}")
-            print(f"Timestamp: {data['timestamp']}")
 
-            # Parse the ISO timestamp correctly
+            # Parse the URL and check if domain should be ignored
+            domain = urlparse(data['url']).netloc
+            if config.is_domain_ignored(domain):
+                logger.info(f"Ignoring domain: {domain}")
+                await websocket.send_json({
+                    "status": "ignored",
+                    "message": f"Domain {domain} is in ignore list"
+                })
+                continue
+
+            logger.info(f"Processing page: {data['url']}")
             timestamp = iso8601.parse_date(data['timestamp'])
 
+            # Check if we already have a recent entry for this URL
+            existing_entry = db.query(HistoryEntry).filter(
+                HistoryEntry.url == data['url'],
+                HistoryEntry.visit_time >= timestamp - timedelta(minutes=5)
+            ).first()
+
+            if existing_entry:
+                print(f"Recent entry exists for URL: {data['url']}")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "message": "Recent entry exists"
+                })
+                continue
+
             page_info = PageInfo(
                 url=data['url'],
                 html=data['html'],
                 timestamp=timestamp
             )
-            print(f"Created PageInfo object for: {page_info.url}")
 
-            # Convert HTML to markdown
-            print("Converting HTML to markdown...")
+            # Debug HTML content
+            print(f"HTML content length before processing: {len(page_info.html)}")
+
+            # Extract title
+            soup = BeautifulSoup(page_info.html, 'html.parser')
+            title = soup.title.string if soup.title else ''
+            print(f"Extracted title: {title}")
+
+            # Debug markdown conversion
+            print("Starting markdown conversion...")
+            cleaned_html = page_reader.clean_html(page_info.html)
+            print(f"Cleaned HTML length: {len(cleaned_html)}")
+
             markdown_content = page_reader.html_to_markdown(page_info.html)
-            print(f"Markdown conversion complete, length: {len(markdown_content) if markdown_content else 0}")
+            print(f"Markdown conversion complete. Content length: {len(markdown_content) if markdown_content else 0}")
+            if markdown_content:
+                print("First 100 chars of markdown:", markdown_content[:100])
+            else:
+                print("No markdown content generated")
 
-            # Update or create history entry
-            domain = urlparse(page_info.url).netloc
-            print(f"Creating history entry for domain: {domain}")
+            if not title and not markdown_content:
+                print(f"No content extracted from: {page_info.url}")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "message": "No content extracted"
+                })
+                continue
 
+            # Create history entry
             history_entry = HistoryEntry(
                 url=page_info.url,
+                title=title,
                 visit_time=page_info.timestamp,
                 domain=domain,
                 markdown_content=markdown_content,
                 last_content_update=datetime.now(timezone.utc)
             )
-            print("Saving to database...")
-            db.add(history_entry)
-            db.commit()
-            print("Database save complete")
 
-            # Send confirmation back to client
-            await websocket.send_json({
-                "status": "success",
-                "message": f"Processed page: {page_info.url}"
-            })
+            # Debug database operation
+            print(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}")
+
+            # Use bulk operations for better performance
+            db.add(history_entry)
+
+            try:
+                db.commit()
+                print(f"Successfully saved entry for: {page_info.url}")
+                print(f"Verify markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}")
+                await websocket.send_json({
+                    "status": "success",
+                    "message": f"Processed page: {page_info.url}"
+                })
+            except Exception as e:
+                db.rollback()
+                print(f"Error saving entry: {e}")
+                await websocket.send_json({
+                    "status": "error",
+                    "message": "Database error"
+                })
 
     except WebSocketDisconnect:
-        print("Client disconnected")
+        logger.info("Client disconnected")
     except Exception as e:
-        print(f"Error handling message: {e}")
+        logger.error("Error in WebSocket handler", exc_info=True)
+        # Send error back to client if possible
+        try:
+            await websocket.send_json({
+                "status": "error",
+                "message": str(e)
+            })
+        except:
+            pass
     finally:
-        print("Cleaning up resources")
-        page_reader.close()
+        await page_reader.close()
+
+@app.get("/config/ignored-domains")
+async def get_ignored_domains():
+    """Get list of ignored domain patterns"""
+    return {"ignored_domains": config.config.get('ignored_domains', [])}
+
+@app.post("/config/ignored-domains")
+async def add_ignored_domain(pattern: str):
+    """Add a new domain pattern to ignored list"""
+    config.add_ignored_domain(pattern)
+    return {"status": "success", "message": f"Added pattern: {pattern}"}
+
+@app.delete("/config/ignored-domains/{pattern}")
+async def remove_ignored_domain(pattern: str):
+    """Remove a domain pattern from ignored list"""
+    config.remove_ignored_domain(pattern)
+    return {"status": "success", "message": f"Removed pattern: {pattern}"}
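
Note: a quick sketch (not part of this commit) of exercising the new ignored-domains endpoints; it assumes the API is served on port 8523 (the port the extension's WebSocket targets) and that httpx is installed:

    import httpx

    BASE = "http://localhost:8523"

    # List the current ignore patterns
    print(httpx.get(f"{BASE}/config/ignored-domains").json())

    # Add a pattern (sent as a query parameter, matching the plain `pattern: str` signature)
    print(httpx.post(f"{BASE}/config/ignored-domains", params={"pattern": "*.internal"}).json())

    # Remove it again
    print(httpx.delete(f"{BASE}/config/ignored-domains/*.internal").json())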

app/page_reader.py

@@ -4,15 +4,11 @@ from bs4 import BeautifulSoup
 from typing import Optional
 from urllib.parse import urlparse
 from .config import ReaderConfig
-import logging
-from .database import SessionLocal, BlacklistedDomain
+from .logging_config import setup_logger
+from .database import SessionLocal
 
-# Setup logging with less verbose output
-logging.basicConfig(
-    level=logging.WARNING,
-    format='%(levelname)s: %(message)s'
-)
-logger = logging.getLogger(__name__)
+# Setup logger for this module
+logger = setup_logger(__name__)
 
 # Patterns for cleaning
 SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
@@ -26,13 +22,15 @@ SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
 class PageReader:
     def __init__(self):
         self.config = ReaderConfig()
-        self.db = SessionLocal()
+        logger.info("PageReader initialized")
 
     def clean_html(self, html: str) -> str:
         """Clean HTML by removing unwanted elements and patterns."""
         if not html:
+            logger.warning("Received empty HTML to clean")
             return ""
 
+        logger.debug(f"Cleaning HTML of length: {len(html)}")
         # First use regex to remove problematic patterns
         html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
         html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
@@ -54,12 +52,15 @@
             ]
 
             for element in elements_to_remove:
+                removed = len(soup.find_all(element))
+                if removed:
+                    logger.debug(f"Removed {removed} {element} elements")
                 for tag in soup.find_all(element):
                     tag.decompose()
 
             return str(soup)
         except Exception as e:
-            logger.error(f"Error cleaning HTML: {e}")
+            logger.error(f"Error cleaning HTML: {e}", exc_info=True)
             return ""
 
     def clean_whitespace(self, text: str) -> str:
@@ -80,11 +81,17 @@
     def html_to_markdown(self, html: str) -> Optional[str]:
         """Convert HTML to markdown."""
         try:
+            logger.info("Starting HTML to Markdown conversion")
+            logger.debug(f"Input HTML length: {len(html)}")
             cleaned_html = self.clean_html(html)
+            logger.debug(f"Cleaned HTML length: {len(cleaned_html)}")
             if not cleaned_html:
+                logger.warning("No cleaned HTML content")
                 return None
 
-            return self.clean_whitespace(md(cleaned_html,
+            markdown = self.clean_whitespace(md(cleaned_html,
                 heading_style="ATX",
                 bullets="-",
                 autolinks=True,
@@ -92,10 +99,19 @@
                 escape_asterisks=True,
                 escape_underscores=True))
 
+            logger.debug(f"Generated markdown length: {len(markdown) if markdown else 0}")
+            if not markdown or markdown.isspace():
+                logger.warning("Markdown is empty or whitespace only")
+                return None
+
+            return markdown
         except Exception as e:
-            logger.error(f"Error converting to markdown: {e}")
+            logger.error("Error converting to markdown", exc_info=True)
             return None
 
-    def close(self):
+    async def close(self):
         """Cleanup resources"""
-        self.db.close()
+        logger.info("Closing PageReader")
+        pass  # No need to close DB connection anymore
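
Note: a minimal sketch (not part of this commit) of driving PageReader directly; close() is now a coroutine, so it must be awaited, as the WebSocket handler above does:

    import asyncio
    from app.page_reader import PageReader

    async def demo():
        reader = PageReader()
        html = "<html><head><title>Hi</title></head><body><p>Hello world</p></body></html>"
        print(reader.html_to_markdown(html))  # markdown rendering of the cleaned page
        await reader.close()

    asyncio.run(demo())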

app/scheduler.py

@@ -7,6 +7,9 @@ from .page_reader import PageReader
 from sqlalchemy import func
 from sqlalchemy.orm import Session
 import pytz
+from .config import Config
+from .database import get_db
+from urllib.parse import urlparse
 
 class HistoryScheduler:
     def __init__(self):
@@ -14,6 +17,7 @@
         self.page_reader = PageReader()
         self.last_history_update = None
         self.content_update_interval = timedelta(hours=24)  # Update content daily
+        self.config = Config()
 
     def _normalize_datetime(self, dt: datetime) -> datetime:
         """Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't"""
@@ -28,81 +32,70 @@
         return dt.astimezone(pytz.UTC)
 
     async def update_bookmarks(self):
-        bookmarks = self.browser_collector.fetch_bookmarks()
-        db = SessionLocal()
+        """Update bookmarks from browser"""
         try:
-            # First, get all existing URLs to avoid duplicates
-            existing_urls = {
-                url: (added_time, folder)
-                for url, added_time, folder in
-                db.query(Bookmark.url, Bookmark.added_time, Bookmark.folder).all()
-            }
-
-            new_entries = []
-            for added_time, url, title, folder in bookmarks:
+            db = next(get_db())
+            bookmarks = self.browser_collector.fetch_bookmarks()
+
+            for added_time, url, title, folder in bookmarks:  # Unpack the tuple
+                # Extract domain and check if it should be ignored
+                domain = urlparse(url).netloc
+                if self.config.is_domain_ignored(domain):
+                    continue
+
                 # Normalize the datetime
                 added_time = self._normalize_datetime(added_time)
 
-                # Only add if URL doesn't exist or if it's in a different folder
-                if (url not in existing_urls or
-                    existing_urls[url][1] != folder):
-                    domain = self.browser_collector.get_domain(url)
-                    entry = Bookmark(
-                        url=url,
-                        title=title,
-                        added_time=added_time,
-                        folder=folder,
-                        domain=domain
-                    )
-                    new_entries.append(entry)
-
-            if new_entries:
-                db.bulk_save_objects(new_entries)
-                db.commit()
+                # Process the bookmark only if domain is not ignored
+                bookmark_entry = Bookmark(
+                    url=url,
+                    title=title,
+                    added_time=added_time,
+                    folder=folder,
+                    domain=domain
+                )
+                db.add(bookmark_entry)
+
+            db.commit()
+        except Exception as e:
+            print(f"Error updating bookmarks: {e}")
         finally:
            db.close()
 
     async def update_history(self):
+        """Background task to update history periodically"""
         while True:
-            db = SessionLocal()
             try:
-                # Get the latest timestamp from our database
-                latest_entry = db.query(func.max(HistoryEntry.visit_time)).scalar()
-                if latest_entry:
-                    latest_entry = self._normalize_datetime(latest_entry)
-
-                # Fetch new history
-                history = self.browser_collector.fetch_history()
-
-                # Filter to only get entries newer than our latest entry
-                new_entries = []
-                for visit_time, url, title in history:
+                db = next(get_db())
+                history_entries = self.browser_collector.fetch_history()
+
+                for visit_time, url, title in history_entries:  # Unpack the tuple
+                    # Extract domain and check if it should be ignored
+                    domain = urlparse(url).netloc
+                    if self.config.is_domain_ignored(domain):
+                        continue
+
                     # Normalize the datetime
                     visit_time = self._normalize_datetime(visit_time)
 
-                    if not latest_entry or visit_time > latest_entry:
-                        domain = self.browser_collector.get_domain(url)
-                        entry = HistoryEntry(
-                            url=url,
-                            title=title,
-                            visit_time=visit_time,
-                            domain=domain
-                        )
-                        new_entries.append(entry)
-
-                if new_entries:
-                    db.bulk_save_objects(new_entries)
-                    db.commit()
-
-                # Update bookmarks
-                await self.update_bookmarks()
-            except Exception as e:
-                print(f"Error updating history: {e}")
+                    # Process the entry only if domain is not ignored
+                    history_entry = HistoryEntry(
+                        url=url,
+                        title=title,
+                        visit_time=visit_time,
+                        domain=domain
+                    )
+                    db.add(history_entry)
+
+                db.commit()
             finally:
                 db.close()
 
-            # Wait for 5 minutes before next update
-            await asyncio.sleep(300)
+            await asyncio.sleep(300)  # Wait 5 minutes before next update
 
     async def close(self):
         """Cleanup resources"""

Extension background script

@@ -1,5 +1,82 @@
 console.log("Background script loaded");
 
+class WebSocketClient {
+    constructor() {
+        console.log("WebSocketClient constructor called");
+        this.messageQueue = [];
+        this.connect();
+        this.reconnectAttempts = 0;
+        this.maxReconnectAttempts = 5;
+    }
+
+    connect() {
+        console.log('Attempting to connect to WebSocket server...');
+        try {
+            this.ws = new WebSocket('ws://localhost:8523/ws');
+            console.log('WebSocket instance created');
+
+            this.ws.addEventListener('open', () => {
+                console.log('WebSocket connection opened successfully');
+                this.reconnectAttempts = 0;
+                this.processQueue();
+            });
+
+            this.ws.addEventListener('error', (event) => {
+                console.error('WebSocket error occurred:', event);
+            });
+
+            this.ws.addEventListener('close', (event) => {
+                console.log('WebSocket connection closed:', event.code, event.reason);
+                this.tryReconnect();
+            });
+
+            this.ws.addEventListener('message', (event) => {
+                console.log('Received message from server:', event.data);
+            });
+        } catch (error) {
+            console.error('Error creating WebSocket:', error);
+        }
+    }
+
+    processQueue() {
+        console.log(`Processing message queue (${this.messageQueue.length} messages)`);
+        while (this.messageQueue.length > 0) {
+            const data = this.messageQueue.shift();
+            this.sendMessage(data);
+        }
+    }
+
+    tryReconnect() {
+        if (this.reconnectAttempts < this.maxReconnectAttempts) {
+            this.reconnectAttempts++;
+            console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
+            setTimeout(() => this.connect(), 2000 * this.reconnectAttempts);
+        } else {
+            console.log('Max reconnection attempts reached');
+        }
+    }
+
+    sendMessage(data) {
+        if (this.ws.readyState === WebSocket.OPEN) {
+            try {
+                console.log('Sending data for URL:', data.url);
+                this.ws.send(JSON.stringify(data));
+                console.log('Data sent successfully');
+                return true;
+            } catch (error) {
+                console.error('Error sending data:', error);
+                return false;
+            }
+        } else {
+            console.log('WebSocket not ready, queueing message');
+            this.messageQueue.push(data);
+            return true;
+        }
+    }
+}
+
+const wsClient = new WebSocketClient();
+
 async function isContentScriptReady(tabId) {
     try {
         await browser.tabs.sendMessage(tabId, { type: "PING" });
@@ -38,9 +115,17 @@ async function sendMessageToTab(tabId) {
     }
 }
 
+// Listen for messages from content scripts
+browser.runtime.onMessage.addListener((message, sender) => {
+    if (message.type === "SEND_PAGE_CONTENT") {
+        console.log('Received page content from tab:', sender.tab.id);
+        wsClient.sendMessage(message.data);
+    }
+});
+
 browser.webNavigation.onCompleted.addListener(async (details) => {
     console.log("Navigation completed", details);
-    if (details.frameId === 0) { // Only handle main frame navigation
+    if (details.frameId === 0) {
         console.log(`Main frame navigation detected for tab ${details.tabId}`);
         await sendMessageToTab(details.tabId);
     }

Extension content script

@@ -1,132 +1,32 @@
 console.log("Content script starting initialization...");
 
-// Function to log WebSocket state
-function getWebSocketState(ws) {
-    const states = {
-        0: 'CONNECTING',
-        1: 'OPEN',
-        2: 'CLOSING',
-        3: 'CLOSED'
-    };
-    return states[ws.readyState] || 'UNKNOWN';
-}
-
-class WebSocketClient {
-    constructor() {
-        console.log("WebSocketClient constructor called");
-        this.messageQueue = [];
-        this.connect();
-        this.reconnectAttempts = 0;
-        this.maxReconnectAttempts = 5;
-    }
-
-    connect() {
-        console.log('Attempting to connect to WebSocket server...');
-        try {
-            this.ws = new WebSocket('ws://localhost:8523/ws');
-            console.log('WebSocket instance created');
-
-            this.ws.addEventListener('open', () => {
-                console.log('WebSocket connection opened successfully');
-                this.reconnectAttempts = 0;
-                // Process any queued messages
-                this.processQueue();
-            });
-
-            this.ws.addEventListener('error', (event) => {
-                console.error('WebSocket error occurred:', event);
-            });
-
-            this.ws.addEventListener('close', (event) => {
-                console.log('WebSocket connection closed:', event.code, event.reason);
-                this.tryReconnect();
-            });
-
-            this.ws.addEventListener('message', (event) => {
-                console.log('Received message from server:', event.data);
-            });
-        } catch (error) {
-            console.error('Error creating WebSocket:', error);
-        }
-    }
-
-    processQueue() {
-        console.log(`Processing message queue (${this.messageQueue.length} messages)`);
-        while (this.messageQueue.length > 0) {
-            const data = this.messageQueue.shift();
-            this.sendMessage(data);
-        }
-    }
-
-    tryReconnect() {
-        if (this.reconnectAttempts < this.maxReconnectAttempts) {
-            this.reconnectAttempts++;
-            console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
-            setTimeout(() => this.connect(), 2000 * this.reconnectAttempts);
-        } else {
-            console.log('Max reconnection attempts reached');
-        }
-    }
-
-    sendMessage(data) {
-        console.log('sendMessage called, WebSocket state:', getWebSocketState(this.ws));
-        if (this.ws.readyState === WebSocket.OPEN) {
-            try {
-                console.log('Preparing to send data:', {
-                    url: data.url,
-                    timestamp: data.timestamp,
-                    htmlLength: data.html.length
-                });
-                this.ws.send(JSON.stringify(data));
-                console.log('Data sent successfully');
-                return true;
-            } catch (error) {
-                console.error('Error sending data:', error);
-                return false;
-            }
-        } else {
-            console.log('WebSocket not ready, queueing message');
-            this.messageQueue.push(data);
-            return true;
-        }
-    }
-}
-
-console.log("Creating WebSocketClient instance...");
-const wsClient = new WebSocketClient();
-
-console.log("Setting up message listener...");
+function sendPageContent() {
+    const pageContent = {
+        url: window.location.href,
+        html: document.documentElement.outerHTML,
+        timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
+    };
+
+    browser.runtime.sendMessage({
+        type: "SEND_PAGE_CONTENT",
+        data: pageContent
+    });
+}
+
+// Listen for messages from the background script
 browser.runtime.onMessage.addListener((message, sender, sendResponse) => {
-    console.log('Message received from background script:', message);
     if (message.type === "PING") {
-        console.log('Received PING, responding...');
         return Promise.resolve({ status: "ready" });
     }
     if (message.type === "GET_PAGE_CONTENT") {
-        console.log('Processing GET_PAGE_CONTENT message');
-        const pageContent = {
-            url: window.location.href,
-            html: document.documentElement.outerHTML,
-            timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
-        };
-        console.log('Created page content object for:', pageContent.url);
-        wsClient.sendMessage(pageContent);
+        sendPageContent();
     }
     return true;
 });
 
 // Send initial page content
-console.log('Sending initial page content...');
-const pageContent = {
-    url: window.location.href,
-    html: document.documentElement.outerHTML,
-    timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
-};
-wsClient.sendMessage(pageContent);
+sendPageContent();
 
 console.log("Content script initialization complete for:", window.location.href);

Deleted file (standalone httpx-based HTML-to-markdown helper)

@@ -1,84 +0,0 @@
-import httpx
-import re
-from markdownify import markdownify as md
-from bs4 import BeautifulSoup
-
-# Patterns for cleaning
-SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
-STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
-META_PATTERN = r"<[ ]*meta.*?>"
-COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
-LINK_PATTERN = r"<[ ]*link.*?>"
-BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
-SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
-
-def clean_html(html: str) -> str:
-    """Clean HTML by removing unwanted elements and patterns."""
-    # First use regex to remove problematic patterns
-    html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(BASE64_IMG_PATTERN, "", html)
-
-    # Use BeautifulSoup to remove additional elements we want to strip
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Remove unwanted elements
-    elements_to_remove = [
-        'canvas', 'img', 'picture', 'audio', 'video',
-        'iframe', 'embed', 'object', 'param', 'track',
-        'map', 'area', 'source'
-    ]
-
-    for element in elements_to_remove:
-        for tag in soup.find_all(element):
-            tag.decompose()
-
-    return str(soup)
-
-def get_page_html(url: str) -> str:
-    """Fetch HTML content from a given URL using httpx."""
-    headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
-    }
-    try:
-        with httpx.Client(follow_redirects=True) as client:
-            response = client.get(url, headers=headers)
-            response.raise_for_status()
-            return response.text
-    except httpx.HTTPError as e:
-        print(f"Error fetching page: {e}")
-        return ""
-
-def clean_whitespace(text: str) -> str:
-    """Clean excessive whitespace from text, collapsing more than 2 newlines."""
-    # Replace 3 or more newlines with 2 newlines
-    cleaned = re.sub(r'\n{3,}', '\n\n', text)
-    # Remove trailing whitespace from each line
-    cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
-    return cleaned.strip()
-
-def html_to_markdown(url: str) -> str:
-    """Convert webpage HTML to markdown."""
-    html = get_page_html(url)
-    if not html:
-        return ""
-
-    # Clean the HTML first
-    cleaned_html = clean_html(html)
-
-    # Convert to markdown using markdownify
-    # Configure markdownify options for clean output
-    markdown = md(cleaned_html,
-                  heading_style="ATX",      # Use # style headers
-                  bullets="-",              # Use - for bullets
-                  autolinks=True,           # Convert URLs to links
-                  strip=['form'],           # Additional elements to strip
-                  escape_asterisks=True,
-                  escape_underscores=True)
-
-    # Clean up excessive whitespace
-    return clean_whitespace(markdown)

requirements.txt

@@ -2,9 +2,10 @@ fastapi
 uvicorn
 sqlalchemy
 browser-history
-beautifulsoup4
+beautifulsoup4>=4.9.3
 markdownify
-pyyaml
+pyyaml>=6.0.1
 pytz
 websockets==11.0.3
 iso8601==2.1.0
+lxml>=4.9.3