mirror of https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 02:19:37 +00:00

Refactor to use crawl4ai, uv

@@ -1 +1 @@
-3.10.6
+3.10.16
121  app/config.py
@@ -2,6 +2,10 @@ import yaml
 from pathlib import Path
 from typing import Set
 import fnmatch
+import os
+import logging
+
+logger = logging.getLogger(__name__)
 
 class Config:
     def __init__(self):
@@ -128,3 +132,120 @@ class ReaderConfig:
         return False
 
         return True
+
+DEFAULT_CONFIG_PATH = 'config/reader_config.yaml'
+USER_CONFIG_DIR = os.path.expanduser("~/.config/browser-recall")
+USER_CONFIG_PATH = os.path.join(USER_CONFIG_DIR, 'reader_config.yaml')
+
+class Config:
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if not cls._instance:
+            cls._instance = super(Config, cls).__new__(cls)
+            cls._instance._initialized = False
+        return cls._instance
+
+    def __init__(self, config_path=None):
+        if self._initialized:
+            return
+        self._initialized = True
+
+        self.config_path = self._determine_config_path(config_path)
+        self.config_data = self._load_config()
+        logger.info(f"Config initialized using: {self.config_path}")
+        # Pre-process excluded domains for faster lookup if needed,
+        # but direct iteration with fnmatch is often fine for moderate lists.
+        self.excluded_domains = self.config_data.get('excluded_domains', [])
+        # Ensure it's a list
+        if not isinstance(self.excluded_domains, list):
+            logger.warning(f"Excluded domains in config is not a list: {self.excluded_domains}. Ignoring.")
+            self.excluded_domains = []
+
+    def _determine_config_path(self, provided_path):
+        """Determine the correct config path to use."""
+        if provided_path and os.path.exists(provided_path):
+            return provided_path
+        if os.path.exists(USER_CONFIG_PATH):
+            return USER_CONFIG_PATH
+        if os.path.exists(DEFAULT_CONFIG_PATH):
+            return DEFAULT_CONFIG_PATH
+        logger.warning("No configuration file found at default or user locations. Using empty config.")
+        return None # Indicate no file was found
+
+    def _load_config(self):
+        """Loads the YAML configuration file."""
+        if not self.config_path:
+            return {} # Return empty dict if no config file path determined
+
+        try:
+            with open(self.config_path, 'r') as f:
+                return yaml.safe_load(f) or {} # Return empty dict if file is empty
+        except FileNotFoundError:
+            logger.warning(f"Configuration file not found at {self.config_path}. Using default settings.")
+            return {}
+        except yaml.YAMLError as e:
+            logger.error(f"Error parsing configuration file {self.config_path}: {e}")
+            return {} # Return empty dict on parsing error
+        except Exception as e:
+            logger.error(f"Unexpected error loading configuration {self.config_path}: {e}")
+            return {}
+
+    def get_config(self):
+        """Returns the loaded configuration data."""
+        return self.config_data
+
+    def reload_config(self):
+        """Reloads the configuration from the file."""
+        logger.info(f"Reloading configuration from: {self.config_path}")
+        self.config_data = self._load_config()
+        self.excluded_domains = self.config_data.get('excluded_domains', [])
+        if not isinstance(self.excluded_domains, list):
+            logger.warning(f"Excluded domains in reloaded config is not a list: {self.excluded_domains}. Ignoring.")
+            self.excluded_domains = []
+        logger.info("Configuration reloaded.")
+
+    def is_domain_ignored(self, domain: str) -> bool:
+        """
+        Checks if a given domain matches any pattern in the excluded_domains list.
+        Supports exact matches and wildcard (*) matching using fnmatch.
+        """
+        if not domain: # Ignore empty domains
+            return True
+        if not self.excluded_domains: # If list is empty, nothing is ignored
+            return False
+
+        # Normalize domain to lowercase for case-insensitive comparison
+        domain_lower = domain.lower()
+
+        for pattern in self.excluded_domains:
+            if not isinstance(pattern, str): # Skip non-string patterns
+                continue
+
+            # Normalize pattern to lowercase
+            pattern_lower = pattern.lower()
+
+            # Use fnmatch.fnmatch for wildcard support (*)
+            if fnmatch.fnmatch(domain_lower, pattern_lower):
+                # logger.debug(f"Domain '{domain}' ignored due to pattern '{pattern}'")
+                return True
+        return False
+
+    # --- Add methods to get specific config values safely ---
+    @property
+    def history_update_interval_seconds(self) -> int:
+        """Gets the history update interval, defaulting to 300."""
+        return self.config_data.get('history_update_interval_seconds', 300)
+
+    @property
+    def markdown_update_interval_seconds(self) -> int:
+        """Gets the markdown update interval, defaulting to 300."""
+        return self.config_data.get('markdown_update_interval_seconds', 300)
+
+    # Add other specific getters as needed
+    # Example:
+    # @property
+    # def some_other_setting(self) -> str:
+    #     return self.config_data.get('some_other_setting', 'default_value')
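A brief usage sketch of the new singleton Config added above. The attribute and method names come from the diff; the sample YAML patterns in the comments are illustrative assumptions, only the keys the class actually reads are taken from the code.

# usage sketch -- illustration only, not part of the commit
from app.config import Config

config = Config()              # __new__ hands back one shared instance
assert config is Config()      # every call yields the same object

# Example ~/.config/browser-recall/reader_config.yaml (illustrative):
#   excluded_domains:
#     - "localhost"
#     - "*.internal.example"
#   history_update_interval_seconds: 600

print(config.history_update_interval_seconds)             # 600 if set, otherwise the 300 default
print(config.is_domain_ignored("api.internal.example"))   # True via the fnmatch wildcard
print(config.is_domain_ignored("example.org"))            # False
config.reload_config()                                     # re-reads the YAML and refreshes excluded_domains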
686  app/main.py
@@ -1,493 +1,293 @@
|
|||||||
from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException
|
from fastapi import FastAPI, Depends
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
from datetime import datetime, timezone, timedelta
|
from datetime import datetime, timezone
|
||||||
from typing import List, Optional
|
from typing import Optional
|
||||||
import asyncio
|
import asyncio
|
||||||
from fastapi import WebSocketDisconnect
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import pytz
|
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
import iso8601
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from sqlalchemy import text
|
|
||||||
from sqlalchemy.sql import text
|
|
||||||
from .logging_config import setup_logger
|
|
||||||
from fastapi.templating import Jinja2Templates
|
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
from fastapi import Request
|
|
||||||
import browser_history
|
import browser_history
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
# Local imports
|
||||||
|
from .logging_config import setup_logger
|
||||||
from .database import (
|
from .database import (
|
||||||
get_db,
|
get_db,
|
||||||
HistoryEntry,
|
HistoryEntry,
|
||||||
Bookmark,
|
|
||||||
get_last_processed_timestamp,
|
get_last_processed_timestamp,
|
||||||
update_last_processed_timestamp,
|
update_last_processed_timestamp,
|
||||||
create_tables,
|
create_tables,
|
||||||
engine,
|
engine,
|
||||||
recreate_fts_tables
|
# recreate_fts_tables # Keep if needed, but often done manually or via migration tool
|
||||||
)
|
)
|
||||||
from .scheduler import HistoryScheduler
|
|
||||||
from .page_info import PageInfo
|
|
||||||
from .page_reader import PageReader
|
|
||||||
from .config import Config
|
from .config import Config
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
|
||||||
|
# Import Routers
|
||||||
|
from .routers import history, bookmarks, config as api_config, websocket, ui
|
||||||
|
|
||||||
logger = setup_logger(__name__)
|
logger = setup_logger(__name__)
|
||||||
|
|
||||||
app = FastAPI()
|
# --- Global Variables ---
|
||||||
scheduler = HistoryScheduler()
|
# These are accessed by other modules (like websocket router)
|
||||||
config = Config()
|
# Consider using app state or dependency injection for cleaner management if complexity grows
|
||||||
|
config_manager = Config() # Renamed to avoid conflict with router import
|
||||||
|
crawler: Optional[AsyncWebCrawler] = None
|
||||||
|
|
||||||
# Add CORS middleware to allow WebSocket connections
|
# Import scheduler *after* crawler is defined
|
||||||
|
from .scheduler import HistoryScheduler
|
||||||
|
scheduler: Optional[HistoryScheduler] = None # Now initialize scheduler variable
|
||||||
|
|
||||||
|
# --- FastAPI App Initialization ---
|
||||||
|
app = FastAPI(title="Browser History Search API")
|
||||||
|
|
||||||
|
# Add CORS middleware
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
CORSMiddleware,
|
CORSMiddleware,
|
||||||
allow_origins=["*"], # In production, specify your domains
|
allow_origins=["*"], # Adjust in production
|
||||||
allow_credentials=True,
|
allow_credentials=True,
|
||||||
allow_methods=["*"],
|
allow_methods=["*"],
|
||||||
allow_headers=["*"],
|
allow_headers=["*"],
|
||||||
)
|
)
|
||||||
|
|
||||||
templates = Jinja2Templates(directory="app/templates")
|
# Mount static files and templates
|
||||||
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
||||||
|
# Note: Templates are used within the ui router now, no need for global instance here unless used elsewhere
|
||||||
|
|
||||||
@app.on_event("startup")
|
# --- Helper Function (Initial Sync) ---
|
||||||
async def startup_event():
|
|
||||||
logger.info("Starting application")
|
|
||||||
|
|
||||||
try:
|
|
||||||
# First create the base tables
|
|
||||||
logger.info("Creating base tables...")
|
|
||||||
create_tables()
|
|
||||||
|
|
||||||
# # Drop and recreate FTS tables
|
|
||||||
# logger.info("Recreating FTS tables...")
|
|
||||||
# with engine.connect() as conn:
|
|
||||||
# # First check if the main history table exists
|
|
||||||
# result = conn.execute(text(
|
|
||||||
# "SELECT name FROM sqlite_master WHERE type='table' AND name='history'"
|
|
||||||
# )).fetchone()
|
|
||||||
|
|
||||||
# if not result:
|
|
||||||
# logger.info("Main history table doesn't exist yet, creating tables...")
|
|
||||||
# Base.metadata.create_all(bind=engine)
|
|
||||||
|
|
||||||
# # Now recreate FTS tables
|
|
||||||
# logger.info("Dropping and recreating FTS tables...")
|
|
||||||
# recreate_fts_tables()
|
|
||||||
|
|
||||||
# logger.info("FTS tables recreation completed")
|
|
||||||
|
|
||||||
# Initial history and bookmark fetch
|
|
||||||
logger.info("Processing initial browser history...")
|
|
||||||
process_browser_history()
|
|
||||||
|
|
||||||
logger.info("Updating bookmarks...")
|
|
||||||
await scheduler.update_bookmarks()
|
|
||||||
|
|
||||||
# Start the background tasks
|
|
||||||
logger.info("Starting background tasks...")
|
|
||||||
asyncio.create_task(scheduler.update_history())
|
|
||||||
|
|
||||||
logger.info("Startup completed successfully")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error during startup: {str(e)}", exc_info=True)
|
|
||||||
raise
|
|
||||||
|
|
||||||
def serialize_history_entry(entry, include_content: bool = False):
|
|
||||||
"""Serialize a HistoryEntry object to a dictionary"""
|
|
||||||
# Handle both ORM objects and raw SQL results
|
|
||||||
if hasattr(entry, '_mapping'): # Raw SQL result
|
|
||||||
result = {
|
|
||||||
"id": entry.id,
|
|
||||||
"url": entry.url,
|
|
||||||
"title": entry.title,
|
|
||||||
"visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
|
|
||||||
"domain": entry.domain,
|
|
||||||
}
|
|
||||||
else: # ORM object
|
|
||||||
result = {
|
|
||||||
"id": entry.id,
|
|
||||||
"url": entry.url,
|
|
||||||
"title": entry.title,
|
|
||||||
"visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
|
|
||||||
"domain": entry.domain,
|
|
||||||
}
|
|
||||||
|
|
||||||
if include_content:
|
|
||||||
result["markdown_content"] = entry.markdown_content
|
|
||||||
return result
|
|
||||||
|
|
||||||
def serialize_bookmark(bookmark):
|
|
||||||
"""Serialize a Bookmark object to a dictionary"""
|
|
||||||
return {
|
|
||||||
"id": bookmark.id,
|
|
||||||
"url": bookmark.url,
|
|
||||||
"title": bookmark.title,
|
|
||||||
"added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
|
|
||||||
"folder": bookmark.folder,
|
|
||||||
"domain": bookmark.domain,
|
|
||||||
}
|
|
||||||
|
|
||||||
@app.get("/history/search")
|
|
||||||
async def search_history(
|
|
||||||
query: Optional[str] = Query(None),
|
|
||||||
domain: Optional[str] = Query(None),
|
|
||||||
start_date: Optional[str] = Query(None),
|
|
||||||
end_date: Optional[str] = Query(None),
|
|
||||||
include_content: bool = Query(False),
|
|
||||||
db: Session = Depends(get_db)
|
|
||||||
):
|
|
||||||
"""Search history using FTS5"""
|
|
||||||
try:
|
|
||||||
if query:
|
|
||||||
# Build the FTS query
|
|
||||||
fts_conditions = [f'title:{query}* OR markdown_content:{query}*']
|
|
||||||
params = {'query': query}
|
|
||||||
|
|
||||||
if domain:
|
|
||||||
fts_conditions.append(f'domain:"{domain}"')
|
|
||||||
|
|
||||||
fts_query = ' AND '.join(fts_conditions)
|
|
||||||
|
|
||||||
# Build the SQL query
|
|
||||||
sql = """
|
|
||||||
SELECT
|
|
||||||
h.*,
|
|
||||||
bm25(history_fts) as rank,
|
|
||||||
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
|
|
||||||
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
|
|
||||||
FROM history_fts
|
|
||||||
JOIN history h ON history_fts.rowid = h.id
|
|
||||||
WHERE history_fts MATCH :fts_query
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Add date filters if provided
|
|
||||||
if start_date:
|
|
||||||
sql += " AND h.visit_time >= :start_date"
|
|
||||||
params['start_date'] = start_date
|
|
||||||
if end_date:
|
|
||||||
sql += " AND h.visit_time <= :end_date"
|
|
||||||
params['end_date'] = end_date
|
|
||||||
|
|
||||||
sql += " ORDER BY rank, h.visit_time DESC LIMIT 100"
|
|
||||||
|
|
||||||
params['fts_query'] = fts_query
|
|
||||||
|
|
||||||
results = db.execute(text(sql), params).fetchall()
|
|
||||||
return [serialize_history_entry(row, include_content) for row in results]
|
|
||||||
|
|
||||||
else:
|
|
||||||
# Handle non-search queries
|
|
||||||
query = db.query(HistoryEntry)
|
|
||||||
|
|
||||||
if domain:
|
|
||||||
query = query.filter(HistoryEntry.domain == domain)
|
|
||||||
if start_date:
|
|
||||||
query = query.filter(HistoryEntry.visit_time >= start_date)
|
|
||||||
if end_date:
|
|
||||||
query = query.filter(HistoryEntry.visit_time <= end_date)
|
|
||||||
|
|
||||||
entries = query.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
|
|
||||||
return [serialize_history_entry(entry, include_content) for entry in entries]
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Search error: {str(e)}", exc_info=True)
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=500,
|
|
||||||
detail={"message": "Search operation failed", "error": str(e)}
|
|
||||||
)
|
|
||||||
|
|
||||||
@app.get("/bookmarks/search")
|
|
||||||
async def search_bookmarks(
|
|
||||||
domain: Optional[str] = Query(None),
|
|
||||||
folder: Optional[str] = Query(None),
|
|
||||||
search_term: Optional[str] = Query(None),
|
|
||||||
db: Session = Depends(get_db)
|
|
||||||
):
|
|
||||||
"""Search bookmarks with optimized queries"""
|
|
||||||
try:
|
|
||||||
# Build query efficiently
|
|
||||||
query = db.query(Bookmark)
|
|
||||||
|
|
||||||
# Apply filters using index-optimized queries
|
|
||||||
if domain:
|
|
||||||
query = query.filter(Bookmark.domain == domain)
|
|
||||||
|
|
||||||
if folder:
|
|
||||||
query = query.filter(Bookmark.folder == folder)
|
|
||||||
|
|
||||||
if search_term:
|
|
||||||
# Use LIKE with index hint for title search
|
|
||||||
search_pattern = f"%{search_term}%"
|
|
||||||
query = query.filter(
|
|
||||||
Bookmark.title.ilike(search_pattern)
|
|
||||||
).with_hint(
|
|
||||||
Bookmark,
|
|
||||||
'INDEXED BY ix_bookmarks_title',
|
|
||||||
'sqlite'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add ordering and limit for better performance
|
|
||||||
bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
|
|
||||||
|
|
||||||
return [serialize_bookmark(bookmark) for bookmark in bookmarks]
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Bookmark search error: {e}")
|
|
||||||
raise HTTPException(status_code=500, detail="Search operation failed")
|
|
||||||
|
|
||||||
# Add new endpoint for advanced full-text search
|
|
||||||
@app.get("/history/search/advanced")
|
|
||||||
async def advanced_history_search(
|
|
||||||
query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
|
|
||||||
include_content: bool = Query(False),
|
|
||||||
db: Session = Depends(get_db)
|
|
||||||
):
|
|
||||||
"""Advanced full-text search using SQLite FTS5 features"""
|
|
||||||
try:
|
|
||||||
# Use raw SQL for advanced FTS query
|
|
||||||
fts_query = """
|
|
||||||
SELECT h.*, rank
|
|
||||||
FROM history h
|
|
||||||
INNER JOIN history_fts f ON h.id = f.rowid
|
|
||||||
WHERE history_fts MATCH :query
|
|
||||||
ORDER BY rank
|
|
||||||
LIMIT 1000
|
|
||||||
"""
|
|
||||||
|
|
||||||
results = db.execute(text(fts_query), {'query': query}).all()
|
|
||||||
|
|
||||||
# Convert results to HistoryEntry objects
|
|
||||||
entries = [
|
|
||||||
serialize_history_entry(
|
|
||||||
HistoryEntry(
|
|
||||||
id=row.id,
|
|
||||||
url=row.url,
|
|
||||||
title=row.title,
|
|
||||||
visit_time=row.visit_time,
|
|
||||||
domain=row.domain,
|
|
||||||
markdown_content=row.markdown_content if include_content else None
|
|
||||||
),
|
|
||||||
include_content
|
|
||||||
)
|
|
||||||
for row in results
|
|
||||||
]
|
|
||||||
|
|
||||||
return entries
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Advanced search error: {e}")
|
|
||||||
raise HTTPException(status_code=500, detail="Advanced search operation failed")
|
|
||||||
|
|
||||||
@app.websocket("/ws")
|
|
||||||
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
|
|
||||||
logger.info("New WebSocket connection established")
|
|
||||||
page_reader = PageReader()
|
|
||||||
await websocket.accept()
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
data = await websocket.receive_json()
|
|
||||||
|
|
||||||
# Parse the URL and check if domain should be ignored
|
|
||||||
domain = urlparse(data['url']).netloc
|
|
||||||
if config.is_domain_ignored(domain):
|
|
||||||
logger.info(f"Ignoring domain: {domain}")
|
|
||||||
await websocket.send_json({
|
|
||||||
"status": "ignored",
|
|
||||||
"message": f"Domain {domain} is in ignore list"
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
logger.info(f"Processing page: {data['url']}")
|
|
||||||
timestamp = iso8601.parse_date(data['timestamp'])
|
|
||||||
|
|
||||||
# Check if we already have a recent entry for this URL
|
|
||||||
existing_entry = db.query(HistoryEntry).filter(
|
|
||||||
HistoryEntry.url == data['url'],
|
|
||||||
HistoryEntry.visit_time >= timestamp - timedelta(minutes=5)
|
|
||||||
).first()
|
|
||||||
|
|
||||||
if existing_entry:
|
|
||||||
print(f"Recent entry exists for URL: {data['url']}")
|
|
||||||
await websocket.send_json({
|
|
||||||
"status": "skipped",
|
|
||||||
"message": "Recent entry exists"
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
page_info = PageInfo(
|
|
||||||
url=data['url'],
|
|
||||||
html=data['html'],
|
|
||||||
timestamp=timestamp
|
|
||||||
)
|
|
||||||
|
|
||||||
# Debug HTML content
|
|
||||||
print(f"HTML content length before processing: {len(page_info.html)}")
|
|
||||||
|
|
||||||
# Extract title
|
|
||||||
soup = BeautifulSoup(page_info.html, 'html.parser')
|
|
||||||
title = soup.title.string if soup.title else ''
|
|
||||||
print(f"Extracted title: {title}")
|
|
||||||
|
|
||||||
# Debug markdown conversion
|
|
||||||
print("Starting markdown conversion...")
|
|
||||||
cleaned_html = page_reader.clean_html(page_info.html)
|
|
||||||
print(f"Cleaned HTML length: {len(cleaned_html)}")
|
|
||||||
|
|
||||||
markdown_content = page_reader.html_to_markdown(page_info.html)
|
|
||||||
print(f"Markdown conversion complete. Content length: {len(markdown_content) if markdown_content else 0}")
|
|
||||||
|
|
||||||
if markdown_content:
|
|
||||||
print("First 100 chars of markdown:", markdown_content[:100])
|
|
||||||
else:
|
|
||||||
print("No markdown content generated")
|
|
||||||
|
|
||||||
if not title and not markdown_content:
|
|
||||||
print(f"No content extracted from: {page_info.url}")
|
|
||||||
await websocket.send_json({
|
|
||||||
"status": "skipped",
|
|
||||||
"message": "No content extracted"
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Create history entry
|
|
||||||
history_entry = HistoryEntry(
|
|
||||||
url=page_info.url,
|
|
||||||
title=title,
|
|
||||||
visit_time=page_info.timestamp,
|
|
||||||
domain=domain,
|
|
||||||
markdown_content=markdown_content,
|
|
||||||
last_content_update=datetime.now(timezone.utc)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Debug database operation
|
|
||||||
print(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}")
|
|
||||||
|
|
||||||
# Use bulk operations for better performance
|
|
||||||
db.add(history_entry)
|
|
||||||
|
|
||||||
try:
|
|
||||||
db.commit()
|
|
||||||
print(f"Successfully saved entry for: {page_info.url}")
|
|
||||||
print(f"Verify markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}")
|
|
||||||
await websocket.send_json({
|
|
||||||
"status": "success",
|
|
||||||
"message": f"Processed page: {page_info.url}"
|
|
||||||
})
|
|
||||||
except Exception as e:
|
|
||||||
db.rollback()
|
|
||||||
print(f"Error saving entry: {e}")
|
|
||||||
await websocket.send_json({
|
|
||||||
"status": "error",
|
|
||||||
"message": "Database error"
|
|
||||||
})
|
|
||||||
|
|
||||||
except WebSocketDisconnect:
|
|
||||||
logger.info("Client disconnected")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error("Error in WebSocket handler", exc_info=True)
|
|
||||||
finally:
|
|
||||||
await page_reader.close()
|
|
||||||
|
|
||||||
@app.get("/config/ignored-domains")
|
|
||||||
async def get_ignored_domains():
|
|
||||||
"""Get list of ignored domain patterns"""
|
|
||||||
return {"ignored_domains": config.config.get('ignored_domains', [])}
|
|
||||||
|
|
||||||
@app.post("/config/ignored-domains")
|
|
||||||
async def add_ignored_domain(pattern: str):
|
|
||||||
"""Add a new domain pattern to ignored list"""
|
|
||||||
config.add_ignored_domain(pattern)
|
|
||||||
return {"status": "success", "message": f"Added pattern: {pattern}"}
|
|
||||||
|
|
||||||
@app.delete("/config/ignored-domains/{pattern}")
|
|
||||||
async def remove_ignored_domain(pattern: str):
|
|
||||||
"""Remove a domain pattern from ignored list"""
|
|
||||||
config.remove_ignored_domain(pattern)
|
|
||||||
return {"status": "success", "message": f"Removed pattern: {pattern}"}
|
|
||||||
|
|
||||||
@app.get("/")
|
|
||||||
async def home(request: Request, db: Session = Depends(get_db)):
|
|
||||||
# Get recent history entries
|
|
||||||
entries = db.query(HistoryEntry)\
|
|
||||||
.order_by(HistoryEntry.visit_time.desc())\
|
|
||||||
.limit(50)\
|
|
||||||
.all()
|
|
||||||
return templates.TemplateResponse(
|
|
||||||
"index.html",
|
|
||||||
{"request": request, "entries": entries}
|
|
||||||
)
|
|
||||||
|
|
||||||
@app.get("/search")
|
|
||||||
async def search_page(request: Request):
|
|
||||||
return templates.TemplateResponse(
|
|
||||||
"search.html",
|
|
||||||
{"request": request}
|
|
||||||
)
|
|
||||||
|
|
||||||
@app.get("/bookmarks")
|
|
||||||
async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
|
|
||||||
bookmarks = db.query(Bookmark)\
|
|
||||||
.order_by(Bookmark.added_time.desc())\
|
|
||||||
.limit(50)\
|
|
||||||
.all()
|
|
||||||
return templates.TemplateResponse(
|
|
||||||
"bookmarks.html",
|
|
||||||
{"request": request, "bookmarks": bookmarks}
|
|
||||||
)
|
|
||||||
|
|
||||||
def process_browser_history():
|
def process_browser_history():
|
||||||
|
"""Fetches and stores new history entries from browser_history library (Initial Sync)."""
|
||||||
try:
|
try:
|
||||||
logger.info("Starting browser history processing")
|
logger.info("Starting browser history processing (initial sync)")
|
||||||
outputs = browser_history.get_history()
|
outputs = browser_history.get_history()
|
||||||
history_list = outputs.histories # This is a list of tuples (timestamp, url, title)
|
# browser_history returns platform specific History object, get histories list
|
||||||
logger.info(f"Found {len(history_list)} total history items")
|
history_list = []
|
||||||
|
if hasattr(outputs, 'histories') and outputs.histories:
|
||||||
|
history_list = outputs.histories # List of (datetime, url, title)
|
||||||
|
else:
|
||||||
|
logger.warning("Could not retrieve histories list from browser_history output.")
|
||||||
|
return # Exit if no history list found
|
||||||
|
|
||||||
current_timestamp = int(datetime.now().timestamp())
|
logger.info(f"Found {len(history_list)} total history items from browser_history library")
|
||||||
source_key = "browser_history" # Single source since we get combined history
|
|
||||||
last_timestamp = get_last_processed_timestamp(source_key)
|
|
||||||
|
|
||||||
logger.info(f"Last processed timestamp: {last_timestamp}")
|
current_timestamp_dt = datetime.now(timezone.utc)
|
||||||
|
current_timestamp = int(current_timestamp_dt.timestamp()) # Use timezone-aware timestamp
|
||||||
|
source_key = "browser_history_sync" # Differentiate from scheduler source
|
||||||
|
last_timestamp = get_last_processed_timestamp(source_key) or 0 # Ensure it's 0 if None
|
||||||
|
|
||||||
# Filter for only new entries
|
logger.info(f"Last processed timestamp for initial sync '{source_key}': {last_timestamp}")
|
||||||
new_entries = [
|
|
||||||
entry for entry in history_list
|
|
||||||
if entry[0].timestamp() > last_timestamp
|
|
||||||
]
|
|
||||||
|
|
||||||
logger.info(f"Found {len(new_entries)} new entries")
|
new_entries = []
|
||||||
|
processed_urls_times = set() # Avoid duplicates within the batch
|
||||||
|
|
||||||
|
for entry in history_list:
|
||||||
|
# Basic validation of entry structure
|
||||||
|
if not isinstance(entry, (tuple, list)) or len(entry) < 2:
|
||||||
|
logger.warning(f"Skipping malformed history entry: {entry}")
|
||||||
|
continue
|
||||||
|
timestamp, url = entry[0], entry[1]
|
||||||
|
title = entry[2] if len(entry) > 2 else "" # Handle optional title
|
||||||
|
|
||||||
|
if not url or not timestamp:
|
||||||
|
logger.warning(f"Skipping entry with missing URL or timestamp: Title='{title}'")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Ensure timestamp is datetime object
|
||||||
|
if not isinstance(timestamp, datetime):
|
||||||
|
logger.warning(f"Skipping entry with non-datetime timestamp ({type(timestamp)}): {url}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Normalize timestamp (Assume local if naive, convert to UTC)
|
||||||
|
if timestamp.tzinfo is None or timestamp.tzinfo.utcoffset(timestamp) is None:
|
||||||
|
try:
|
||||||
|
timestamp_aware = timestamp.astimezone() # Make aware using system local
|
||||||
|
except Exception as tz_err:
|
||||||
|
logger.warning(f"Could not determine local timezone for naive timestamp {timestamp}. Assuming UTC. Error: {tz_err}")
|
||||||
|
timestamp_aware = timestamp.replace(tzinfo=timezone.utc) # Fallback to UTC
|
||||||
|
else:
|
||||||
|
timestamp_aware = timestamp
|
||||||
|
timestamp_utc = timestamp_aware.astimezone(timezone.utc)
|
||||||
|
|
||||||
|
|
||||||
|
# Filter for only new entries based on normalized UTC timestamp
|
||||||
|
if timestamp_utc.timestamp() > last_timestamp:
|
||||||
|
entry_key = (url, timestamp_utc.timestamp())
|
||||||
|
if entry_key in processed_urls_times:
|
||||||
|
continue # Skip duplicate within this batch
|
||||||
|
|
||||||
|
new_entries.append((timestamp_utc, url, title))
|
||||||
|
processed_urls_times.add(entry_key)
|
||||||
|
|
||||||
|
logger.info(f"Found {len(new_entries)} new entries for initial sync after filtering")
|
||||||
|
|
||||||
if new_entries:
|
if new_entries:
|
||||||
for timestamp, url, title in new_entries:
|
added_count = 0
|
||||||
logger.info(f"Processing entry: {timestamp} - {url}")
|
skipped_ignored = 0
|
||||||
|
# Use context manager for session
|
||||||
|
with next(get_db()) as db:
|
||||||
|
try:
|
||||||
|
for timestamp_utc, url, title in new_entries:
|
||||||
domain = urlparse(url).netloc
|
domain = urlparse(url).netloc
|
||||||
if config.is_domain_ignored(domain):
|
if config_manager.is_domain_ignored(domain):
|
||||||
logger.debug(f"Skipping ignored domain: {domain}")
|
# logger.debug(f"Skipping ignored domain during initial sync: {domain}")
|
||||||
|
skipped_ignored += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Create history entry
|
# Optional: Check if entry already exists more robustly
|
||||||
db = next(get_db())
|
# existing = db.query(HistoryEntry.id).filter(HistoryEntry.url == url, HistoryEntry.visit_time == timestamp_utc).first()
|
||||||
try:
|
# if existing:
|
||||||
|
# continue
|
||||||
|
|
||||||
history_entry = HistoryEntry(
|
history_entry = HistoryEntry(
|
||||||
url=url,
|
url=url,
|
||||||
title=title,
|
title=title or "", # Ensure title is not None
|
||||||
visit_time=timestamp,
|
visit_time=timestamp_utc,
|
||||||
domain=domain
|
domain=domain
|
||||||
|
# Note: No markdown content here, only basic history
|
||||||
)
|
)
|
||||||
db.add(history_entry)
|
db.add(history_entry)
|
||||||
|
added_count += 1
|
||||||
|
|
||||||
|
if added_count > 0:
|
||||||
db.commit()
|
db.commit()
|
||||||
except Exception as e:
|
logger.info(f"Committed {added_count} new history entries from initial sync.")
|
||||||
logger.error(f"Error storing history item: {str(e)}")
|
# Update the last processed timestamp only if successful commit
|
||||||
db.rollback()
|
|
||||||
finally:
|
|
||||||
db.close()
|
|
||||||
|
|
||||||
# Update the last processed timestamp
|
|
||||||
update_last_processed_timestamp(source_key, current_timestamp)
|
update_last_processed_timestamp(source_key, current_timestamp)
|
||||||
logger.info(f"Updated timestamp to {current_timestamp}")
|
logger.info(f"Updated initial sync timestamp for '{source_key}' to {current_timestamp}")
|
||||||
|
else:
|
||||||
|
logger.info("No new unique entries to commit during initial sync.")
|
||||||
|
# Update timestamp even if nothing new added, to mark sync time
|
||||||
|
update_last_processed_timestamp(source_key, current_timestamp)
|
||||||
|
logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
|
||||||
|
|
||||||
logger.info(f"Processed {len(new_entries)} new items")
|
|
||||||
|
if skipped_ignored > 0:
|
||||||
|
logger.info(f"Skipped {skipped_ignored} entries due to ignored domains during initial sync.")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error processing browser history: {str(e)}", exc_info=True)
|
logger.error(f"Error storing history item during initial sync: {str(e)}", exc_info=True)
|
||||||
|
db.rollback()
|
||||||
|
else:
|
||||||
|
logger.info("No new history entries found during initial sync.")
|
||||||
|
# Update timestamp even if nothing new found, to mark sync time
|
||||||
|
update_last_processed_timestamp(source_key, current_timestamp)
|
||||||
|
logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
|
||||||
|
|
||||||
|
|
||||||
|
except ImportError:
|
||||||
|
logger.warning("`browser_history` library not found or import failed. Skipping initial sync.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing browser history during initial sync: {str(e)}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Startup and Shutdown Events ---
|
||||||
|
@app.on_event("startup")
|
||||||
|
async def startup_event():
|
||||||
|
global crawler, scheduler # Allow modification of globals
|
||||||
|
logger.info("Starting application initialization...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 1. Ensure base tables exist
|
||||||
|
logger.info("Ensuring base tables exist...")
|
||||||
|
create_tables()
|
||||||
|
|
||||||
|
# 2. Initialize the crawler
|
||||||
|
logger.info("Initializing AsyncWebCrawler...")
|
||||||
|
if crawler is None:
|
||||||
|
crawler = AsyncWebCrawler()
|
||||||
|
logger.info("AsyncWebCrawler initialized.")
|
||||||
|
|
||||||
|
# 3. Initialize the Scheduler *after* the crawler
|
||||||
|
logger.info("Initializing HistoryScheduler...")
|
||||||
|
if scheduler is None:
|
||||||
|
scheduler = HistoryScheduler(crawler=crawler) # Pass crawler instance
|
||||||
|
logger.info("HistoryScheduler initialized.")
|
||||||
|
|
||||||
|
# 4. Perform initial history sync from browser_history library
|
||||||
|
logger.info("Performing initial browser history sync...")
|
||||||
|
process_browser_history() # Sync history not processed before
|
||||||
|
|
||||||
|
# 5. Perform initial bookmark sync (using scheduler's method)
|
||||||
|
# Run in background to avoid blocking startup if it takes long
|
||||||
|
logger.info("Starting initial bookmark sync task...")
|
||||||
|
asyncio.create_task(scheduler.update_bookmarks())
|
||||||
|
|
||||||
|
# 6. Start background tasks (scheduler for ongoing updates)
|
||||||
|
logger.info("Starting background history update task...")
|
||||||
|
asyncio.create_task(scheduler.update_history())
|
||||||
|
|
||||||
|
# --- Markdown Update Tasks ---
|
||||||
|
# 7a. Trigger ONE initial batch processing run in the background
|
||||||
|
logger.info("Starting initial markdown processing batch task...")
|
||||||
|
asyncio.create_task(scheduler._process_markdown_batch()) # Run one batch now
|
||||||
|
|
||||||
|
# 7b. Start the PERIODIC background markdown update task
|
||||||
|
logger.info("Starting periodic background markdown update task...")
|
||||||
|
# Use the renamed method for the loop
|
||||||
|
asyncio.create_task(scheduler.update_missing_markdown_periodically())
|
||||||
|
# --- End Markdown Update Tasks ---
|
||||||
|
|
||||||
|
|
||||||
|
logger.info("Application startup sequence initiated. Background tasks running.")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"FATAL ERROR during application startup: {str(e)}", exc_info=True)
|
||||||
|
raise RuntimeError(f"Application startup failed: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("shutdown")
|
||||||
|
async def shutdown_event():
|
||||||
|
global crawler, scheduler
|
||||||
|
logger.info("Starting application shutdown...")
|
||||||
|
|
||||||
|
# Stop scheduler tasks gracefully if possible (implement cancellation in tasks if needed)
|
||||||
|
# For now, we just close resources
|
||||||
|
|
||||||
|
# Close scheduler resources
|
||||||
|
if scheduler and hasattr(scheduler, 'close'):
|
||||||
|
try:
|
||||||
|
logger.info("Closing scheduler resources...")
|
||||||
|
await scheduler.close() # Call the scheduler's close method
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error closing scheduler: {e}", exc_info=True)
|
||||||
|
|
||||||
|
# Close crawler if needed (check crawl4ai docs for explicit close method)
|
||||||
|
# Based on previous code, seems no explicit close needed, but keep check just in case
|
||||||
|
if crawler and hasattr(crawler, 'aclose'):
|
||||||
|
try:
|
||||||
|
logger.info("Closing AsyncWebCrawler...")
|
||||||
|
# await crawler.aclose() # Example if an async close exists
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error closing crawler: {e}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Close database engine connections if necessary (usually handled automatically by SQLAlchemy)
|
||||||
|
# if engine and hasattr(engine, 'dispose'): # Check if using async engine that needs dispose
|
||||||
|
# await engine.dispose()
|
||||||
|
|
||||||
|
logger.info("Application shutdown complete.")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Include Routers ---
|
||||||
|
app.include_router(history.router)
|
||||||
|
app.include_router(bookmarks.router)
|
||||||
|
app.include_router(api_config.router)
|
||||||
|
app.include_router(websocket.router)
|
||||||
|
app.include_router(ui.router)
|
||||||
|
|
||||||
|
# Optional: Add a root endpoint for health check or basic info
|
||||||
|
@app.get("/health", tags=["service"])
|
||||||
|
async def health_check():
|
||||||
|
# Extended health check could verify DB connection or task status
|
||||||
|
db_ok = False
|
||||||
|
try:
|
||||||
|
with next(get_db()) as db:
|
||||||
|
db.execute("SELECT 1")
|
||||||
|
db_ok = True
|
||||||
|
except Exception:
|
||||||
|
db_ok = False
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "ok",
|
||||||
|
"database_connection": "ok" if db_ok else "error",
|
||||||
|
# Add other checks as needed
|
||||||
|
}
|
||||||
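The refactor in app/main.py above replaces the old BeautifulSoup/PageReader markdown pipeline with crawl4ai: one AsyncWebCrawler is created at startup and shared with the scheduler and the websocket router. A minimal sketch of that call pattern, mirroring the calls made in this diff (arun() per URL, then .markdown and the metadata title); it is an illustration under those assumptions, not the exact handler, and the example URL is made up.

# crawl sketch -- illustration of the crawl4ai call pattern used by this commit
import asyncio
from crawl4ai import AsyncWebCrawler

async def fetch_markdown(crawler: AsyncWebCrawler, url: str) -> tuple[str, str]:
    result = await crawler.arun(url=url)                   # same call the websocket handler makes
    if not result:
        return "", ""
    markdown = result.markdown or ""                       # markdown rendering of the page
    title = getattr(result.metadata, "title", "") or ""    # title lookup as written in the handler
    return title, markdown

async def main():
    crawler = AsyncWebCrawler()                            # created once at startup in app/main.py
    title, markdown = await fetch_markdown(crawler, "https://example.org")
    print(title, len(markdown))

# asyncio.run(main())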
@@ -1,117 +0,0 @@
|
|||||||
import re
|
|
||||||
from markdownify import markdownify as md
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from typing import Optional
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
from .config import ReaderConfig
|
|
||||||
from .logging_config import setup_logger
|
|
||||||
from .database import SessionLocal
|
|
||||||
|
|
||||||
# Setup logger for this module
|
|
||||||
logger = setup_logger(__name__)
|
|
||||||
|
|
||||||
# Patterns for cleaning
|
|
||||||
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
|
|
||||||
STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
|
|
||||||
META_PATTERN = r"<[ ]*meta.*?>"
|
|
||||||
COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
|
|
||||||
LINK_PATTERN = r"<[ ]*link.*?>"
|
|
||||||
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
|
|
||||||
SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
|
|
||||||
|
|
||||||
class PageReader:
|
|
||||||
def __init__(self):
|
|
||||||
self.config = ReaderConfig()
|
|
||||||
logger.info("PageReader initialized")
|
|
||||||
|
|
||||||
def clean_html(self, html: str) -> str:
|
|
||||||
"""Clean HTML by removing unwanted elements and patterns."""
|
|
||||||
if not html:
|
|
||||||
logger.warning("Received empty HTML to clean")
|
|
||||||
return ""
|
|
||||||
|
|
||||||
logger.debug(f"Cleaning HTML of length: {len(html)}")
|
|
||||||
# First use regex to remove problematic patterns
|
|
||||||
html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(BASE64_IMG_PATTERN, "", html)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Use BeautifulSoup to remove additional elements we want to strip
|
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
|
||||||
|
|
||||||
# Remove unwanted elements
|
|
||||||
elements_to_remove = [
|
|
||||||
'canvas', 'img', 'picture', 'audio', 'video',
|
|
||||||
'iframe', 'embed', 'object', 'param', 'track',
|
|
||||||
'map', 'area', 'source'
|
|
||||||
]
|
|
||||||
|
|
||||||
for element in elements_to_remove:
|
|
||||||
removed = len(soup.find_all(element))
|
|
||||||
if removed:
|
|
||||||
logger.debug(f"Removed {removed} {element} elements")
|
|
||||||
for tag in soup.find_all(element):
|
|
||||||
tag.decompose()
|
|
||||||
|
|
||||||
return str(soup)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error cleaning HTML: {e}", exc_info=True)
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def clean_whitespace(self, text: str) -> str:
|
|
||||||
"""Clean excessive whitespace from text."""
|
|
||||||
if not text:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Replace 3 or more newlines with 2 newlines
|
|
||||||
cleaned = re.sub(r'\n{3,}', '\n\n', text)
|
|
||||||
# Remove trailing whitespace from each line
|
|
||||||
cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
|
|
||||||
return cleaned.strip()
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error cleaning whitespace: {e}")
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def html_to_markdown(self, html: str) -> Optional[str]:
|
|
||||||
"""Convert HTML to markdown."""
|
|
||||||
try:
|
|
||||||
logger.info("Starting HTML to Markdown conversion")
|
|
||||||
logger.debug(f"Input HTML length: {len(html)}")
|
|
||||||
|
|
||||||
cleaned_html = self.clean_html(html)
|
|
||||||
logger.debug(f"Cleaned HTML length: {len(cleaned_html)}")
|
|
||||||
|
|
||||||
if not cleaned_html:
|
|
||||||
logger.warning("No cleaned HTML content")
|
|
||||||
return None
|
|
||||||
|
|
||||||
markdown = self.clean_whitespace(md(cleaned_html,
|
|
||||||
heading_style="ATX",
|
|
||||||
bullets="-",
|
|
||||||
autolinks=True,
|
|
||||||
strip=['form'],
|
|
||||||
escape_asterisks=True,
|
|
||||||
escape_underscores=True))
|
|
||||||
|
|
||||||
logger.debug(f"Generated markdown length: {len(markdown) if markdown else 0}")
|
|
||||||
|
|
||||||
if not markdown or markdown.isspace():
|
|
||||||
logger.warning("Markdown is empty or whitespace only")
|
|
||||||
return None
|
|
||||||
|
|
||||||
return markdown
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error("Error converting to markdown", exc_info=True)
|
|
||||||
return None
|
|
||||||
|
|
||||||
async def close(self):
|
|
||||||
"""Cleanup resources"""
|
|
||||||
logger.info("Closing PageReader")
|
|
||||||
pass # No need to close DB connection anymore
|
|
||||||
47  app/routers/bookmarks.py  Normal file
@@ -0,0 +1,47 @@
+from fastapi import APIRouter, Depends, Query, HTTPException
+from sqlalchemy.orm import Session
+from typing import List, Optional
+
+from ..database import get_db, Bookmark
+from ..utils import serialize_bookmark
+from ..logging_config import setup_logger
+
+logger = setup_logger(__name__)
+router = APIRouter(prefix="/bookmarks", tags=["bookmarks"])
+
+@router.get("/search")
+async def search_bookmarks(
+    domain: Optional[str] = Query(None),
+    folder: Optional[str] = Query(None),
+    search_term: Optional[str] = Query(None),
+    db: Session = Depends(get_db)
+):
+    """Search bookmarks with optimized queries"""
+    try:
+        # Build query efficiently
+        query = db.query(Bookmark)
+
+        # Apply filters using index-optimized queries
+        if domain:
+            query = query.filter(Bookmark.domain == domain)
+
+        if folder:
+            query = query.filter(Bookmark.folder == folder)
+
+        if search_term:
+            # Use LIKE for title search (consider FTS for bookmarks if needed)
+            search_pattern = f"%{search_term}%"
+            query = query.filter(Bookmark.title.ilike(search_pattern))
+            # Removed index hint as SQLAlchemy/SQLite usually handles this well with LIKE
+
+        # Add ordering and limit for better performance
+        bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
+
+        return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+
+    except Exception as e:
+        logger.error(f"Bookmark search error: {e}", exc_info=True)
+        raise HTTPException(
+            status_code=500,
+            detail={"message": "Bookmark search operation failed", "error": str(e)}
+        )
43  app/routers/config.py  Normal file
@@ -0,0 +1,43 @@
+from fastapi import APIRouter, Depends, HTTPException
+from typing import List
+
+from ..config import Config
+from ..logging_config import setup_logger
+
+logger = setup_logger(__name__)
+router = APIRouter(prefix="/config", tags=["config"])
+
+# Assuming config is a singleton or easily accessible
+# If not, you might need to use Depends or app state
+config = Config()
+
+@router.get("/ignored-domains")
+async def get_ignored_domains():
+    """Get list of ignored domain patterns"""
+    try:
+        return {"ignored_domains": config.config.get('ignored_domains', [])}
+    except Exception as e:
+        logger.error(f"Error getting ignored domains: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail="Failed to retrieve ignored domains")
+
+
+@router.post("/ignored-domains")
+async def add_ignored_domain(pattern: str):
+    """Add a new domain pattern to ignored list"""
+    try:
+        config.add_ignored_domain(pattern)
+        return {"status": "success", "message": f"Added pattern: {pattern}"}
+    except Exception as e:
+        logger.error(f"Error adding ignored domain '{pattern}': {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail="Failed to add ignored domain")
+
+
+@router.delete("/ignored-domains/{pattern}")
+async def remove_ignored_domain(pattern: str):
+    """Remove a domain pattern from ignored list"""
+    try:
+        config.remove_ignored_domain(pattern)
+        return {"status": "success", "message": f"Removed pattern: {pattern}"}
+    except Exception as e:
+        logger.error(f"Error removing ignored domain '{pattern}': {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail="Failed to remove ignored domain")
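A hedged sketch of exercising the new /config endpoints with FastAPI's TestClient. The app.main import matches this repo's layout, but actually running it depends on the shared Config object providing the config.config mapping and the add_ignored_domain / remove_ignored_domain methods the router calls, which this diff does not show being defined; the pattern value is made up.

# test sketch -- illustration only
from fastapi.testclient import TestClient
from app.main import app   # the FastAPI instance created in app/main.py

client = TestClient(app)   # note: instantiating this runs the startup event

print(client.get("/config/ignored-domains").json())
# -> {"ignored_domains": [...]}

print(client.post("/config/ignored-domains", params={"pattern": "*.ads.example"}).json())
# -> {"status": "success", "message": "Added pattern: *.ads.example"}

print(client.delete("/config/ignored-domains/*.ads.example").json())
# -> {"status": "success", "message": "Removed pattern: *.ads.example"}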
132  app/routers/history.py  Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
from fastapi import APIRouter, Depends, Query, HTTPException
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
from sqlalchemy import text
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from ..database import get_db, HistoryEntry
|
||||||
|
from ..utils import serialize_history_entry
|
||||||
|
from ..logging_config import setup_logger
|
||||||
|
|
||||||
|
logger = setup_logger(__name__)
|
||||||
|
router = APIRouter(prefix="/history", tags=["history"])
|
||||||
|
|
||||||
|
@router.get("/search")
|
||||||
|
async def search_history(
|
||||||
|
query: Optional[str] = Query(None),
|
||||||
|
domain: Optional[str] = Query(None),
|
||||||
|
start_date: Optional[str] = Query(None),
|
||||||
|
end_date: Optional[str] = Query(None),
|
||||||
|
include_content: bool = Query(False),
|
||||||
|
db: Session = Depends(get_db)
|
||||||
|
):
|
||||||
|
"""Search history using FTS5"""
|
||||||
|
try:
|
||||||
|
if query:
|
||||||
|
# Build the FTS query
|
||||||
|
# Basic query sanitization/escaping might be needed depending on FTS syntax usage
|
||||||
|
# For simple term search, this is okay. For complex FTS syntax, more care is needed.
|
||||||
|
fts_conditions = []
|
||||||
|
params = {}
|
||||||
|
|
||||||
|
# Handle different query parts (title, content, domain)
|
||||||
|
# Example: "term1 title:term2 domain:example.com"
|
||||||
|
# This requires more sophisticated parsing. For now, assume simple query applies to title/content.
|
||||||
|
# A safer approach for user input:
|
||||||
|
sanitized_query = query.replace('"', '""') # Basic FTS escaping for quotes
|
||||||
|
fts_match_expr = f'(title : "{sanitized_query}"* OR markdown_content : "{sanitized_query}"*)'
|
||||||
|
params['fts_query'] = fts_match_expr
|
||||||
|
|
||||||
|
if domain:
|
||||||
|
# Add domain filtering directly in FTS if possible and indexed
|
||||||
|
# Assuming 'domain' is an indexed column in FTS table
|
||||||
|
# params['fts_query'] += f' AND domain : "{domain}"' # Adjust FTS syntax if needed
|
||||||
|
# Or filter after FTS search if domain isn't in FTS index efficiently
|
||||||
|
pass # Domain filtering will be added later if needed
|
||||||
|
|
||||||
|
# Build the SQL query
|
||||||
|
sql = """
|
||||||
|
SELECT
|
||||||
|
h.*,
|
||||||
|
bm25(history_fts) as rank,
|
||||||
|
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
|
||||||
|
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
|
||||||
|
FROM history_fts
|
||||||
|
JOIN history h ON history_fts.rowid = h.id
|
||||||
|
WHERE history_fts MATCH :fts_query
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Add domain filter as a regular WHERE clause if not in FTS MATCH
|
||||||
|
if domain:
|
||||||
|
sql += " AND h.domain = :domain"
|
||||||
|
params['domain'] = domain
|
||||||
|
|
||||||
|
# Add date filters if provided
|
||||||
|
if start_date:
|
||||||
|
sql += " AND h.visit_time >= :start_date"
|
||||||
|
params['start_date'] = start_date
|
||||||
|
if end_date:
|
||||||
|
sql += " AND h.visit_time <= :end_date"
|
||||||
|
params['end_date'] = end_date
|
||||||
|
|
||||||
|
sql += " ORDER BY rank DESC, h.visit_time DESC LIMIT 100" # Rank usually descends
|
||||||
|
|
||||||
|
results = db.execute(text(sql), params).fetchall()
|
||||||
|
# Use the updated serializer that handles potential highlight/rank fields
|
||||||
|
return [serialize_history_entry(row, include_content) for row in results]
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Handle non-search queries (basic filtering)
|
||||||
|
query_builder = db.query(HistoryEntry)
|
||||||
|
|
||||||
|
if domain:
|
||||||
|
query_builder = query_builder.filter(HistoryEntry.domain == domain)
|
||||||
|
if start_date:
|
||||||
|
query_builder = query_builder.filter(HistoryEntry.visit_time >= start_date)
|
||||||
|
if end_date:
|
||||||
|
query_builder = query_builder.filter(HistoryEntry.visit_time <= end_date)
|
||||||
|
|
||||||
|
entries = query_builder.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
|
||||||
|
return [serialize_history_entry(entry, include_content) for entry in entries]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Search error: {str(e)}", exc_info=True)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail={"message": "Search operation failed", "error": str(e)}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/search/advanced")
|
||||||
|
async def advanced_history_search(
|
||||||
|
query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
|
||||||
|
include_content: bool = Query(False),
|
||||||
|
db: Session = Depends(get_db)
|
||||||
|
):
|
||||||
|
"""Advanced full-text search using SQLite FTS5 features"""
|
||||||
|
try:
|
||||||
|
# Use raw SQL for advanced FTS query
|
||||||
|
# Add rank and highlights here as well
|
||||||
|
fts_query = """
|
||||||
|
SELECT
|
||||||
|
h.*,
|
||||||
|
bm25(history_fts) as rank,
|
||||||
|
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
|
||||||
|
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
|
||||||
|
FROM history_fts
|
||||||
|
JOIN history h ON history_fts.rowid = h.id
|
||||||
|
WHERE history_fts MATCH :query
|
||||||
|
ORDER BY rank DESC, h.visit_time DESC
|
||||||
|
LIMIT 1000
|
||||||
|
"""
|
||||||
|
|
||||||
|
results = db.execute(text(fts_query), {'query': query}).fetchall()
|
||||||
|
|
||||||
|
# Use the updated serializer
|
||||||
|
return [serialize_history_entry(row, include_content) for row in results]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Advanced search error: {e}", exc_info=True)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail={"message": "Advanced search operation failed", "error": str(e)}
|
||||||
|
)
|
||||||
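The /history/search and /history/search/advanced endpoints above build on SQLite FTS5: bm25() for ranking and highlight() for match markup. A standalone sketch of that query shape, assuming a SQLite build with FTS5 compiled in; the table and column names follow the SQL in the diff, the sample row is made up. bm25() returns smaller values for better matches, so this sketch orders ascending.

# fts5 sketch -- illustration only
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript("""
    CREATE TABLE history (id INTEGER PRIMARY KEY, url TEXT, title TEXT,
                          markdown_content TEXT, domain TEXT, visit_time TEXT);
    CREATE VIRTUAL TABLE history_fts USING fts5(title, markdown_content,
                                                content='history', content_rowid='id');
    INSERT INTO history (url, title, markdown_content, domain, visit_time)
    VALUES ('https://example.org', 'FTS5 notes', 'How bm25 ranking works', 'example.org', '2025-01-01');
    INSERT INTO history_fts (rowid, title, markdown_content)
    SELECT id, title, markdown_content FROM history;
""")

rows = con.execute("""
    SELECT h.*, bm25(history_fts) AS rank,
           highlight(history_fts, 1, '<mark>', '</mark>') AS content_highlight
    FROM history_fts
    JOIN history h ON history_fts.rowid = h.id
    WHERE history_fts MATCH ?
    ORDER BY rank          -- ascending: lower bm25 score means a better match
""", ("bm25",)).fetchall()
print(rows)   # one row, with 'How <mark>bm25</mark> ranking works' highlighted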
52  app/routers/ui.py  Normal file
@@ -0,0 +1,52 @@
+from fastapi import APIRouter, Depends, Request
+from fastapi.templating import Jinja2Templates
+from sqlalchemy.orm import Session
+
+from ..database import get_db, HistoryEntry, Bookmark
+from ..logging_config import setup_logger
+
+logger = setup_logger(__name__)
+router = APIRouter(tags=["ui"])
+templates = Jinja2Templates(directory="app/templates")
+
+@router.get("/")
+async def home(request: Request, db: Session = Depends(get_db)):
+    try:
+        # Get recent history entries
+        entries = db.query(HistoryEntry)\
+            .order_by(HistoryEntry.visit_time.desc())\
+            .limit(50)\
+            .all()
+        return templates.TemplateResponse(
+            "index.html",
+            {"request": request, "entries": entries}
+        )
+    except Exception as e:
+        logger.error(f"Error loading home page: {e}", exc_info=True)
+        # Optionally return an error template
+        return templates.TemplateResponse("error.html", {"request": request, "detail": "Could not load history"})
+
+
+@router.get("/search")
+async def search_page(request: Request):
+    return templates.TemplateResponse(
+        "search.html",
+        {"request": request}
+    )
+
+
+@router.get("/bookmarks")
+async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
+    try:
+        bookmarks = db.query(Bookmark)\
+            .order_by(Bookmark.added_time.desc())\
+            .limit(50)\
+            .all()
+        return templates.TemplateResponse(
+            "bookmarks.html",
+            {"request": request, "bookmarks": bookmarks}
+        )
+    except Exception as e:
+        logger.error(f"Error loading bookmarks page: {e}", exc_info=True)
+        # Optionally return an error template
+        return templates.TemplateResponse("error.html", {"request": request, "detail": "Could not load bookmarks"})
175  app/routers/websocket.py  Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
import asyncio
|
||||||
|
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, HTTPException
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
import iso8601
|
||||||
|
|
||||||
|
# Import necessary components from other modules
|
||||||
|
from .. import main as app_main # To access global crawler instance
|
||||||
|
from ..database import get_db, HistoryEntry
|
||||||
|
from ..config import Config
|
||||||
|
from ..logging_config import setup_logger
|
||||||
|
|
||||||
|
logger = setup_logger(__name__)
|
||||||
|
router = APIRouter(tags=["websocket"])
|
||||||
|
config = Config() # Assuming config is okay as a separate instance here
|
||||||
|
|
||||||
|
@router.websocket("/ws")
|
||||||
|
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
|
||||||
|
# Access the global crawler instance from main.py
|
||||||
|
crawler = app_main.crawler
|
||||||
|
if not crawler:
|
||||||
|
logger.error("Crawler not initialized!")
|
||||||
|
await websocket.close(code=1011) # Internal Server Error
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info("New WebSocket connection established")
|
||||||
|
await websocket.accept()
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
data = await websocket.receive_json()
|
||||||
|
|
||||||
|
# Validate incoming data structure (basic check)
|
||||||
|
if 'url' not in data or 'timestamp' not in data:
|
||||||
|
logger.warning("Received invalid WebSocket message format.")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "error",
|
||||||
|
"message": "Invalid message format. 'url' and 'timestamp' required."
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
url = data['url']
|
||||||
|
try:
|
||||||
|
timestamp = iso8601.parse_date(data['timestamp'])
|
||||||
|
except iso8601.ParseError:
|
||||||
|
logger.warning(f"Received invalid timestamp format: {data['timestamp']}")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "error",
|
||||||
|
"message": f"Invalid timestamp format: {data['timestamp']}"
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Parse the URL and check if domain should be ignored
|
||||||
|
try:
|
||||||
|
domain = urlparse(url).netloc
|
||||||
|
if not domain: # Handle invalid URLs
|
||||||
|
raise ValueError("Could not parse domain from URL")
|
||||||
|
except ValueError as e:
|
||||||
|
logger.warning(f"Could not parse URL: {url}. Error: {e}")
|
||||||
|
await websocket.send_json({"status": "error", "message": f"Invalid URL: {url}"})
|
||||||
|
continue
|
||||||
|
|
||||||
|
if config.is_domain_ignored(domain):
|
||||||
|
logger.info(f"Ignoring domain: {domain} for URL: {url}")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "ignored",
|
||||||
|
"message": f"Domain {domain} is in ignore list"
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
logger.info(f"Processing page via WebSocket: {url}")
|
||||||
|
|
||||||
|
# Check if we already have a recent entry for this URL
|
||||||
|
# Make timestamp timezone-aware (assuming UTC if naive)
|
||||||
|
if timestamp.tzinfo is None:
|
||||||
|
timestamp = timestamp.replace(tzinfo=timezone.utc)
|
||||||
|
else:
|
||||||
|
timestamp = timestamp.astimezone(timezone.utc)
|
||||||
|
|
||||||
|
recent_threshold = timestamp - timedelta(minutes=5)
|
||||||
|
existing_entry = db.query(HistoryEntry.id).filter(
|
||||||
|
HistoryEntry.url == url,
|
||||||
|
HistoryEntry.visit_time >= recent_threshold
|
||||||
|
).first() # Only fetch ID for efficiency
|
||||||
|
|
||||||
|
if existing_entry:
|
||||||
|
logger.info(f"Recent entry exists for URL: {url}")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "skipped",
|
||||||
|
"message": "Recent entry exists"
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
# --- Start crawl4ai processing ---
|
||||||
|
logger.info(f"Processing page with crawl4ai: {url}")
|
||||||
|
markdown_content = None
|
||||||
|
title = ''
|
||||||
|
try:
|
||||||
|
# Use the global crawler instance
|
||||||
|
crawl_result = await crawler.arun(url=url)
|
||||||
|
if crawl_result:
|
||||||
|
markdown_content = crawl_result.markdown
|
||||||
|
# Attempt to get title from metadata, fallback to empty string
|
||||||
|
title = getattr(crawl_result.metadata, 'title', '') or '' # Ensure title is string
|
||||||
|
if not title:
|
||||||
|
logger.warning(f"Could not extract title for {url} using crawl4ai.")
|
||||||
|
logger.info(f"crawl4ai processing complete. Markdown length: {len(markdown_content) if markdown_content else 0}, Title: '{title}'")
|
||||||
|
else:
|
||||||
|
logger.warning(f"crawl4ai returned None for URL: {url}")
|
||||||
|
markdown_content = "" # Ensure it's not None
|
||||||
|
title = ""
|
||||||
|
|
||||||
|
except Exception as crawl_error:
|
||||||
|
logger.error(f"crawl4ai failed for URL {url}: {crawl_error}", exc_info=True)
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "error",
|
||||||
|
"message": f"Failed to crawl page content: {str(crawl_error)}"
|
||||||
|
})
|
||||||
|
continue # Skip to next message
|
||||||
|
# --- End crawl4ai processing ---
|
||||||
|
|
||||||
|
# Only proceed if we got some content or at least a title
|
||||||
|
if not title and not markdown_content:
|
||||||
|
logger.info(f"No title or content extracted by crawl4ai from: {url}")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "skipped",
|
||||||
|
"message": "No title or content extracted by crawl4ai"
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Create history entry using data from crawl4ai
|
||||||
|
history_entry = HistoryEntry(
|
||||||
|
url=url,
|
||||||
|
title=title, # Use title from crawl4ai
|
||||||
|
visit_time=timestamp, # Use the parsed, timezone-aware timestamp
|
||||||
|
domain=domain,
|
||||||
|
markdown_content=markdown_content, # Use markdown from crawl4ai
|
||||||
|
last_content_update=datetime.now(timezone.utc)
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug(f"Attempting to save entry for {url} with markdown length: {len(markdown_content) if markdown_content else 0}")
|
||||||
|
|
||||||
|
db.add(history_entry)
|
||||||
|
try:
|
||||||
|
db.commit()
|
||||||
|
logger.info(f"Successfully saved entry for: {url}")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "success",
|
||||||
|
"message": f"Processed page: {url}"
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
db.rollback()
|
||||||
|
logger.error(f"Error saving entry for {url}: {e}", exc_info=True)
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "error",
|
||||||
|
"message": "Database error occurred while saving."
|
||||||
|
})
|
||||||
|
|
||||||
|
except WebSocketDisconnect:
|
||||||
|
logger.info("WebSocket client disconnected")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Unhandled error in WebSocket handler: {e}", exc_info=True)
|
||||||
|
# Attempt to inform client before closing (might fail if connection is already broken)
|
||||||
|
try:
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "error",
|
||||||
|
"message": "An internal server error occurred."
|
||||||
|
})
|
||||||
|
except Exception:
|
||||||
|
pass # Ignore if sending fails
|
||||||
|
# Ensure connection is closed on server error
|
||||||
|
try:
|
||||||
|
await websocket.close(code=1011) # Internal Server Error
|
||||||
|
except Exception:
|
||||||
|
pass # Ignore if closing fails
|
||||||
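
For reference, a minimal client sketch for this endpoint. The payload shape ("url" plus an ISO 8601 "timestamp") mirrors what the handler above validates; the host, port, and path are assumptions about how the app is served, not part of this commit.

# Hedged client sketch for the /ws endpoint above; ws://localhost:8000/ws is an assumption.
import asyncio
import json
from datetime import datetime, timezone

import websockets  # already listed as a dependency in pyproject.toml


async def report_visit(url: str) -> None:
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        # Send one page-visit message in the format the handler expects
        await ws.send(json.dumps({
            "url": url,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }))
        reply = json.loads(await ws.recv())
        print(reply["status"], "-", reply["message"])


if __name__ == "__main__":
    asyncio.run(report_visit("https://example.com/some-article"))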

374 app/scheduler.py
@@ -1,142 +1,386 @@
from datetime import datetime, timedelta, timezone
import asyncio
from sqlalchemy import or_, update
from .database import HistoryEntry, Bookmark, get_last_processed_timestamp, update_last_processed_timestamp
from .browser import BrowserHistoryCollector
from .config import Config
from .database import get_db
import urllib.parse
import logging
from crawl4ai import AsyncWebCrawler
from typing import Optional

logger = logging.getLogger(__name__)


class HistoryScheduler:
    def __init__(self, crawler: AsyncWebCrawler):
        self.browser_collector = BrowserHistoryCollector()
        self.last_history_update = None
        self.content_update_interval = timedelta(hours=24)  # Update content daily
        self.config = Config()
        self.db_lock = asyncio.Lock()
        self.crawler = crawler

    def _normalize_datetime(self, dt: datetime) -> Optional[datetime]:
        """Convert datetime to UTC if it has timezone, or make it timezone-aware (UTC) if it doesn't"""
        if dt is None:
            return None

        # If datetime is naive (no timezone), assume it's local and convert to UTC
        if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
            # Assume local timezone if naive, then convert to UTC
            # This might need adjustment based on where the naive datetime originates
            # If browser_history always returns naive UTC, use: dt.replace(tzinfo=timezone.utc)
            # If browser_history returns naive local time:
            dt = dt.astimezone()  # Make timezone-aware using system's local timezone
            return dt.astimezone(timezone.utc)  # Convert to UTC

        # If datetime already has timezone, convert to UTC
        return dt.astimezone(timezone.utc)

    async def update_bookmarks(self):
        """Update bookmarks from browsers"""
        try:
            # Use timezone-aware current time
            current_timestamp_dt = datetime.now(timezone.utc)
            current_timestamp = int(current_timestamp_dt.timestamp())
            source_key = "browser_bookmarks"
            # Ensure last_timestamp is 0 if None
            last_timestamp = get_last_processed_timestamp(source_key) or 0

            logger.info(f"Fetching bookmarks. Last processed timestamp (UTC epoch): {last_timestamp}")
            bookmarks = self.browser_collector.fetch_bookmarks()
            logger.info(f"Found {len(bookmarks)} total bookmarks")

            new_bookmarks = []
            skipped_ignored = 0
            processed_urls = set()  # Avoid processing duplicate bookmark URLs within the same batch

            for added_time, url, title, folder in bookmarks:
                if not url or url in processed_urls:  # Skip empty or duplicate URLs in this batch
                    continue

                # Normalize timestamp *before* comparison
                normalized_added_time = self._normalize_datetime(added_time)
                if normalized_added_time is None:
                    logger.warning(f"Skipping bookmark with invalid timestamp: {url} - {title}")
                    continue

                # Compare timestamps after normalization
                if normalized_added_time.timestamp() > last_timestamp:
                    domain = urllib.parse.urlparse(url).netloc
                    if self.config.is_domain_ignored(domain):
                        # logger.debug(f"Skipping ignored domain for bookmark: {domain}")
                        skipped_ignored += 1
                        continue

                    new_bookmarks.append((normalized_added_time, url, title, folder, domain))
                    processed_urls.add(url)  # Mark URL as processed for this batch

            logger.info(f"Found {len(new_bookmarks)} new bookmarks to process after filtering.")
            if skipped_ignored > 0:
                logger.info(f"Skipped {skipped_ignored} bookmarks due to ignored domains.")

            if new_bookmarks:
                async with self.db_lock:
                    # Use context manager for session
                    with next(get_db()) as db:
                        added_count = 0
                        try:
                            for norm_added_time, url, title, folder, domain in new_bookmarks:
                                # Optional: Check if bookmark already exists (by URL)
                                # existing = db.query(Bookmark.id).filter(Bookmark.url == url).first()
                                # if existing:
                                #     logger.debug(f"Bookmark already exists: {url}")
                                #     continue

                                bookmark = Bookmark(
                                    url=url,
                                    title=title or "",  # Ensure title is not None
                                    added_time=norm_added_time,
                                    folder=folder or "",  # Ensure folder is not None
                                    domain=domain
                                )
                                db.add(bookmark)
                                added_count += 1

                            if added_count > 0:
                                db.commit()
                                logger.info(f"Successfully committed {added_count} new bookmarks.")
                                # Update timestamp only if new bookmarks were added
                                update_last_processed_timestamp(source_key, current_timestamp)
                                logger.info(f"Updated last processed bookmark timestamp for '{source_key}' to {current_timestamp}")
                            else:
                                logger.info("No new unique bookmarks to add in this batch.")
                                # Optionally update timestamp even if no *new* bookmarks were added,
                                # to signify the check was performed up to 'current_timestamp'.
                                # update_last_processed_timestamp(source_key, current_timestamp)
                                # logger.info(f"Updated last processed bookmark timestamp check for '{source_key}' to {current_timestamp}")

                        except Exception as e:
                            logger.error(f"Error committing bookmarks: {str(e)}", exc_info=True)
                            db.rollback()
            else:
                logger.info("No new bookmarks found since last check.")
                # Update timestamp to indicate the check was performed
                update_last_processed_timestamp(source_key, current_timestamp)
                logger.info(f"Updated last processed bookmark timestamp check for '{source_key}' to {current_timestamp}")

        except Exception as e:
            logger.error(f"Error updating bookmarks: {str(e)}", exc_info=True)

    async def update_history(self):
        """Background task to update history periodically"""
        # Initial sleep to allow startup tasks (like initial sync) to potentially finish first
        await asyncio.sleep(10)
        while True:
            try:
                # Use timezone-aware current time
                current_timestamp_dt = datetime.now(timezone.utc)
                current_timestamp = int(current_timestamp_dt.timestamp())
                source_key = "browser_history_scheduler"  # Use a different key than initial sync
                # Ensure last_timestamp is 0 if None
                last_timestamp = get_last_processed_timestamp(source_key) or 0

                logger.info(f"Scheduler: Fetching history. Last processed timestamp (UTC epoch): {last_timestamp}")
                history_entries = self.browser_collector.fetch_history()
                logger.info(f"Scheduler: Found {len(history_entries)} total history entries from browser.")

                new_entries = []
                skipped_ignored = 0
                processed_urls_times = set()  # Avoid duplicates within the batch (url, timestamp)

                for visit_time, url, title in history_entries:
                    # Basic validation
                    if not url or not visit_time:
                        logger.warning(f"Scheduler: Skipping entry with missing URL or timestamp: {title}")
                        continue

                    # Normalize timestamp *before* comparison
                    normalized_visit_time = self._normalize_datetime(visit_time)
                    if normalized_visit_time is None:
                        logger.warning(f"Scheduler: Skipping history with invalid timestamp: {url} - {title}")
                        continue

                    # Compare timestamps after normalization
                    if normalized_visit_time.timestamp() > last_timestamp:
                        entry_key = (url, normalized_visit_time.timestamp())
                        if entry_key in processed_urls_times:
                            continue  # Skip duplicate within this batch

                        domain = urllib.parse.urlparse(url).netloc
                        if self.config.is_domain_ignored(domain):
                            # logger.debug(f"Scheduler: Skipping ignored domain: {domain}")
                            skipped_ignored += 1
                            continue

                        new_entries.append((normalized_visit_time, url, title, domain))
                        processed_urls_times.add(entry_key)

                logger.info(f"Scheduler: Found {len(new_entries)} new history entries to process after filtering.")
                if skipped_ignored > 0:
                    logger.info(f"Scheduler: Skipped {skipped_ignored} history entries due to ignored domains.")

                if new_entries:
                    async with self.db_lock:
                        # Use context manager for session
                        with next(get_db()) as db:
                            added_count = 0
                            try:
                                for norm_visit_time, url, title, domain in new_entries:
                                    # Optional: More robust check if entry already exists
                                    # existing = db.query(HistoryEntry.id).filter(
                                    #     HistoryEntry.url == url,
                                    #     HistoryEntry.visit_time == norm_visit_time
                                    # ).first()
                                    # if existing:
                                    #     logger.debug(f"Scheduler: History entry already exists: {url} at {norm_visit_time}")
                                    #     continue

                                    history_entry = HistoryEntry(
                                        url=url,
                                        title=title or "",  # Ensure title is not None
                                        visit_time=norm_visit_time,
                                        domain=domain
                                        # markdown_content is initially NULL
                                    )
                                    db.add(history_entry)
                                    added_count += 1

                                if added_count > 0:
                                    db.commit()
                                    logger.info(f"Scheduler: Successfully committed {added_count} new history entries.")
                                    # Update timestamp only if new entries were added
                                    update_last_processed_timestamp(source_key, current_timestamp)
                                    logger.info(f"Scheduler: Updated last processed history timestamp for '{source_key}' to {current_timestamp}")
                                else:
                                    logger.info("Scheduler: No new unique history entries to add in this batch.")
                                    # Optionally update timestamp even if no *new* entries were added
                                    # update_last_processed_timestamp(source_key, current_timestamp)
                                    # logger.info(f"Scheduler: Updated last processed history timestamp check for '{source_key}' to {current_timestamp}")

                            except Exception as e:
                                logger.error(f"Scheduler: Error committing history: {str(e)}", exc_info=True)
                                db.rollback()
                else:
                    logger.info("Scheduler: No new history entries found since last check.")
                    # Update timestamp to indicate the check was performed
                    update_last_processed_timestamp(source_key, current_timestamp)
                    logger.info(f"Scheduler: Updated last processed history timestamp check for '{source_key}' to {current_timestamp}")

            except Exception as e:
                logger.error(f"Scheduler: Error in update_history loop: {str(e)}", exc_info=True)

            # --- Access config value using property ---
            try:
                # Use direct attribute access via the @property
                wait_time = self.config.history_update_interval_seconds
            except Exception as config_err:
                logger.error(f"Scheduler (History): Error accessing config for wait time, using default 300s. Error: {config_err}")
                wait_time = 300
            # --- End Access ---

            logger.debug(f"Scheduler (History): Sleeping for {wait_time} seconds.")
            await asyncio.sleep(wait_time)  # Use the obtained wait_time

    async def _process_markdown_batch(self):
        """Fetches and processes one batch (up to 10) of history entries needing markdown."""
        entries_to_process = []
        try:
            # --- Query for entries (inside DB lock/session) ---
            async with self.db_lock:
                with next(get_db()) as db:
                    # Find up to 10 entries where markdown_content is NULL or empty string
                    entries_to_process = db.query(HistoryEntry).filter(
                        or_(HistoryEntry.markdown_content == None, HistoryEntry.markdown_content == '')
                    ).order_by(HistoryEntry.visit_time.asc()).limit(10).all()

                    if entries_to_process:
                        logger.info(f"Markdown Processor: Found {len(entries_to_process)} entries to process in this batch.")
                        for entry in entries_to_process:
                            db.expunge(entry)  # Detach before async operations
                    else:
                        logger.info("Markdown Processor: No history entries found needing markdown update in this batch.")
                        return  # Nothing to do in this batch

            # --- Crawling and Updating (outside the DB lock/session) ---
            processed_count = 0
            skipped_ignored = 0
            for entry in entries_to_process:
                markdown_content = None
                crawl_success = False
                should_update_db = False

                # --- ADD DOMAIN CHECK ---
                try:
                    # +++ Add Debugging Lines +++
                    logger.debug(f"Debugging urllib.parse type: {type(urllib.parse)}")
                    logger.debug(f"Is 'urlparse' in urllib.parse? {'urlparse' in dir(urllib.parse)}")
                    # +++ End Debugging Lines +++

                    domain = urllib.parse.urlparse(entry.url).netloc
                    if self.config.is_domain_ignored(domain):
                        logger.debug(f"Markdown Processor: Skipping ignored domain: {domain} for URL: {entry.url} (ID={entry.id})")
                        skipped_ignored += 1
                        continue
                except Exception as parse_err:
                    logger.warning(f"Markdown Processor: Error parsing URL to get domain: {entry.url} (ID={entry.id}). Type={type(parse_err).__name__} Error: {parse_err}. Skipping entry.")
                    continue
                # --- END DOMAIN CHECK ---

                try:
                    logger.info(f"Markdown Processor: Crawling URL: {entry.url} (ID={entry.id})")
                    if not self.crawler:
                        logger.error("Markdown Processor: Crawler not initialized!")
                        break  # Stop processing this batch if crawler is missing

                    result = await self.crawler.arun(url=entry.url)

                    if result and result.markdown:
                        markdown_content = result.markdown
                        crawl_success = True
                        logger.info(f"Markdown Processor: Successfully crawled and got markdown for ID={entry.id}.")
                    else:
                        logger.warning(f"Markdown Processor: Crawling completed but no markdown content found for ID={entry.id}, URL={entry.url}")
                        markdown_content = ""  # Mark as processed without content
                        crawl_success = True

                    should_update_db = True

                except Exception as crawl_error:
                    logger.error(f"Markdown Processor: Error crawling URL {entry.url} (ID={entry.id}) Type={type(crawl_error).__name__}: {crawl_error}", exc_info=False)
                    should_update_db = False  # Don't update DB on crawl error

                # --- Update DB for this specific entry ---
                if should_update_db:
                    try:
                        async with self.db_lock:
                            with next(get_db()) as db_update:
                                stmt = (
                                    update(HistoryEntry)
                                    .where(HistoryEntry.id == entry.id)
                                    .values(markdown_content=markdown_content)
                                )
                                result_proxy = db_update.execute(stmt)
                                if result_proxy.rowcount > 0:
                                    db_update.commit()
                                    # Adjust log message based on whether it was skipped or processed
                                    if markdown_content == "" and crawl_success and not result.markdown:  # Check if marked empty due to no content
                                        logger.info(f"Markdown Processor: Marked entry as processed (no content found) for ID={entry.id}.")
                                    elif crawl_success:
                                        logger.info(f"Markdown Processor: Successfully updated markdown status for ID={entry.id}.")

                                    # Only increment processed_count if actual content was added or marked empty after crawl
                                    if markdown_content is not None:  # Includes actual markdown or empty string marker
                                        processed_count += 1
                                else:
                                    logger.warning(f"Markdown Processor: Could not find entry ID={entry.id} to update markdown status (rowcount 0).")
                                    db_update.rollback()
                    except Exception as db_update_error:
                        logger.error(f"Markdown Processor: Error updating database for ID={entry.id}: {db_update_error}", exc_info=True)

            log_suffix = f"Updated {processed_count}"
            if skipped_ignored > 0:
                log_suffix += f", Skipped {skipped_ignored} (ignored domain)"
            log_suffix += f" out of {len(entries_to_process)} entries in this batch."
            logger.info(f"Markdown Processor: Finished processing batch. {log_suffix}")

        except Exception as e:
            logger.error(f"Markdown Processor: Error processing markdown batch: {str(e)}", exc_info=True)

    async def update_missing_markdown_periodically(self):
        """Periodically triggers the processing of batches of history entries needing markdown."""
        # Initial slight delay to ensure startup tasks settle
        await asyncio.sleep(15)
        logger.info("Starting periodic markdown update task...")
        while True:
            await self._process_markdown_batch()  # Process one batch

            # Wait before checking for the next batch
            # --- Access config value using property ---
            try:
                # Use direct attribute access via the @property
                wait_time = self.config.markdown_update_interval_seconds
            except Exception as config_err:
                logger.error(f"Periodic Markdown Updater: Error accessing config for wait time, using default 300s. Error: {config_err}")
                wait_time = 300
            # --- End Access ---

            logger.debug(f"Periodic Markdown Updater: Sleeping for {wait_time} seconds before next batch.")
            await asyncio.sleep(wait_time)

    async def close(self):
        """Cleanup resources"""
        logger.info("Closing scheduler resources...")
        # Add any specific cleanup needed for BrowserHistoryCollector if necessary
        # The crawler is managed and closed (if needed) in main.py's shutdown
        pass
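
The scheduler now expects the shared AsyncWebCrawler to be injected instead of owning its own PageReader. Below is a hedged sketch of how main.py could own that crawler and hand it to HistoryScheduler; the lifespan wiring, task handling, and module-level handle are assumptions for illustration, not the repository's actual main.py.

# Hedged sketch of the wiring implied by `app_main.crawler` and
# `HistoryScheduler(crawler=...)`; not the actual main.py from this commit.
import asyncio
from contextlib import asynccontextmanager

from crawl4ai import AsyncWebCrawler
from fastapi import FastAPI

from .scheduler import HistoryScheduler

crawler = None  # module-level handle read by routers via `app_main.crawler`


@asynccontextmanager
async def lifespan(app: FastAPI):
    global crawler
    async with AsyncWebCrawler() as c:  # crawl4ai manages the browser lifecycle
        crawler = c
        scheduler = HistoryScheduler(crawler=c)
        history_task = asyncio.create_task(scheduler.update_history())
        markdown_task = asyncio.create_task(scheduler.update_missing_markdown_periodically())
        try:
            yield
        finally:
            history_task.cancel()
            markdown_task.cancel()
            await scheduler.close()
            crawler = None


app = FastAPI(lifespan=lifespan)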

45 app/utils.py Normal file
@@ -0,0 +1,45 @@
from datetime import datetime
from .database import HistoryEntry, Bookmark


def serialize_history_entry(entry, include_content: bool = False):
    """Serialize a HistoryEntry object or raw SQL result to a dictionary"""
    # Handle both ORM objects and raw SQL results
    if hasattr(entry, '_mapping'):  # Raw SQL result (from execute)
        result = {
            "id": entry.id,
            "url": entry.url,
            "title": entry.title,
            "visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
            "domain": entry.domain,
            # Add potential highlight fields if they exist
            "title_highlight": getattr(entry, 'title_highlight', None),
            "content_highlight": getattr(entry, 'content_highlight', None),
            "rank": getattr(entry, 'rank', None)
        }
        if include_content:
            # Ensure markdown_content exists before accessing
            result["markdown_content"] = getattr(entry, 'markdown_content', None)

    else:  # ORM object (from query)
        result = {
            "id": entry.id,
            "url": entry.url,
            "title": entry.title,
            "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
            "domain": entry.domain,
        }
        if include_content:
            result["markdown_content"] = entry.markdown_content

    return result


def serialize_bookmark(bookmark):
    """Serialize a Bookmark object to a dictionary"""
    return {
        "id": bookmark.id,
        "url": bookmark.url,
        "title": bookmark.title,
        "added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
        "folder": bookmark.folder,
        "domain": bookmark.domain,
    }
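
A hedged sketch of how a JSON route could use serialize_history_entry is shown next; the /api/history path and its query parameter are assumptions for illustration, not code from this commit.

# Hedged usage sketch for serialize_history_entry; the route path is assumed.
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session

from ..database import get_db, HistoryEntry
from ..utils import serialize_history_entry

router = APIRouter(tags=["history"])


@router.get("/api/history")
async def recent_history(limit: int = 50, db: Session = Depends(get_db)):
    entries = (
        db.query(HistoryEntry)
        .order_by(HistoryEntry.visit_time.desc())
        .limit(limit)
        .all()
    )
    # include_content=False keeps the response small; pass True to embed the markdown
    return [serialize_history_entry(entry, include_content=False) for entry in entries]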

@@ -4,13 +4,19 @@ excluded_domains:
- localhost
- 127.0.0.1

# Specific Domains / Subdomains
- ap.www.namecheap.com # Ignore this specific subdomain
- www.namecheap.com # Ignore the main domain (will cover /twofa/* path implicitly)
- login.linode.com # Ignore the login subdomain

# IP ranges (requires wildcard matching in config.py)
- 192.168.*.*
- 10.*.*.*
- 172.16.*.*
- 0.0.0.* # Note: Be careful with overly broad patterns

# Example wildcard patterns (requires wildcard matching in config.py)
# - *.local
# - *.githubusercontent.com
# - *.google.com # Example: Ignore all google subdomains
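
The wildcard entries above are matched against the visited host, not the full URL. A minimal sketch of that check with fnmatch follows; Config.is_domain_ignored in app/config.py is assumed to behave roughly like this, and the pattern list here is just a sample.

# Minimal sketch of wildcard domain matching against excluded_domains-style
# patterns; an assumption about the behavior, not the exact app/config.py code.
import fnmatch
from urllib.parse import urlparse

EXCLUDED_DOMAINS = [
    "localhost",
    "127.0.0.1",
    "ap.www.namecheap.com",
    "192.168.*.*",
    "*.local",
]


def is_domain_ignored(domain: str, patterns=EXCLUDED_DOMAINS) -> bool:
    domain = domain.lower()
    return any(fnmatch.fnmatch(domain, pattern.lower()) for pattern in patterns)


print(is_domain_ignored(urlparse("http://192.168.1.10/admin").netloc))    # True
print(is_domain_ignored(urlparse("https://example.com/article").netloc))  # False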

20 pyproject.toml Normal file
@@ -0,0 +1,20 @@
[project]
name = "browser-recall"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10.16"
dependencies = [
    "crawl4ai",
    "fastapi",
    "sqlalchemy",
    "uvicorn",
    "pytz",
    "aiofiles",
    "websockets",
    "pyyaml",
    "browser-history",
    "pydantic",
    "pydantic-settings",
    "iso8601",
]