Refactor to use crawl4ai, uv

2025-04-11 22:41:46 -05:00
parent 80516440d7
commit 75a2c51b94
14 changed files with 3559 additions and 648 deletions


@@ -1,493 +1,293 @@
from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from sqlalchemy.orm import Session
from sqlalchemy import text
from datetime import datetime, timezone, timedelta
from typing import Optional
from urllib.parse import urlparse
import asyncio
import iso8601
from bs4 import BeautifulSoup
import browser_history
from crawl4ai import AsyncWebCrawler
# Local imports
from .logging_config import setup_logger
from .database import (
    get_db,
    HistoryEntry,
    Bookmark,
    get_last_processed_timestamp,
    update_last_processed_timestamp,
    create_tables,
    engine,
    # recreate_fts_tables  # Keep if needed, but often done manually or via a migration tool
)
from .page_info import PageInfo
from .page_reader import PageReader
from .config import Config
# Import Routers
from .routers import history, bookmarks, config as api_config, websocket, ui
logger = setup_logger(__name__)
# --- Global Variables ---
# These are accessed by other modules (like the websocket router).
# Consider app state or dependency injection for cleaner management if complexity grows.
config_manager = Config()  # Renamed to avoid clashing with the imported config router
config = config_manager  # Alias kept for the endpoints below that still reference `config`
crawler: Optional[AsyncWebCrawler] = None
# Import the scheduler *after* the crawler is defined
from .scheduler import HistoryScheduler
scheduler: Optional[HistoryScheduler] = None  # Initialized during startup
# --- FastAPI App Initialization ---
app = FastAPI(title="Browser History Search API")
# Add CORS middleware
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # In production, specify your domains
allow_origins=["*"], # Adjust in production
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Mount static files and set up templates (templates are used by the page endpoints below)
templates = Jinja2Templates(directory="app/templates")
app.mount("/static", StaticFiles(directory="app/static"), name="static")
def serialize_history_entry(entry, include_content: bool = False):
"""Serialize a HistoryEntry object to a dictionary"""
# Handle both ORM objects and raw SQL results
if hasattr(entry, '_mapping'): # Raw SQL result
result = {
"id": entry.id,
"url": entry.url,
"title": entry.title,
"visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
"domain": entry.domain,
}
else: # ORM object
result = {
"id": entry.id,
"url": entry.url,
"title": entry.title,
"visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
"domain": entry.domain,
}
if include_content:
result["markdown_content"] = entry.markdown_content
return result
def serialize_bookmark(bookmark):
"""Serialize a Bookmark object to a dictionary"""
return {
"id": bookmark.id,
"url": bookmark.url,
"title": bookmark.title,
"added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
"folder": bookmark.folder,
"domain": bookmark.domain,
}
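# Example output shape (illustrative values only):
# serialize_bookmark(b) -> {"id": 1, "url": "https://example.com", "title": "Example",
#                           "added_time": "2025-04-11T22:41:46-05:00",
#                           "folder": "tech", "domain": "example.com"}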
@app.get("/history/search")
async def search_history(
query: Optional[str] = Query(None),
domain: Optional[str] = Query(None),
start_date: Optional[str] = Query(None),
end_date: Optional[str] = Query(None),
include_content: bool = Query(False),
db: Session = Depends(get_db)
):
"""Search history using FTS5"""
try:
if query:
# Build the FTS prefix query (note: user input is interpolated into the FTS
# expression; multi-word or quoted terms may need escaping in production)
fts_conditions = [f'title:{query}* OR markdown_content:{query}*']
params = {}
if domain:
fts_conditions.append(f'domain:"{domain}"')
fts_query = ' AND '.join(fts_conditions)
# Build the SQL query
sql = """
SELECT
h.*,
bm25(history_fts) as rank,
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
FROM history_fts
JOIN history h ON history_fts.rowid = h.id
WHERE history_fts MATCH :fts_query
"""
# Add date filters if provided
if start_date:
sql += " AND h.visit_time >= :start_date"
params['start_date'] = start_date
if end_date:
sql += " AND h.visit_time <= :end_date"
params['end_date'] = end_date
sql += " ORDER BY rank, h.visit_time DESC LIMIT 100"
params['fts_query'] = fts_query
results = db.execute(text(sql), params).fetchall()
return [serialize_history_entry(row, include_content) for row in results]
else:
# No FTS term: fall back to plain ORM filtering
orm_query = db.query(HistoryEntry)
if domain:
orm_query = orm_query.filter(HistoryEntry.domain == domain)
if start_date:
orm_query = orm_query.filter(HistoryEntry.visit_time >= start_date)
if end_date:
orm_query = orm_query.filter(HistoryEntry.visit_time <= end_date)
entries = orm_query.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
return [serialize_history_entry(entry, include_content) for entry in entries]
except Exception as e:
logger.error(f"Search error: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail={"message": "Search operation failed", "error": str(e)}
)
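# Illustrative client sketch for /history/search (not part of the app; the host/port
# and the `requests` dependency are assumptions). Defined here but never called.
def _example_search_history_client():
    import requests
    resp = requests.get(
        "http://localhost:8000/history/search",
        params={
            "query": "sqlite",  # matched against title and markdown_content via FTS5
            "domain": "news.ycombinator.com",
            "include_content": "false",
        },
        timeout=10,
    )
    resp.raise_for_status()
    for entry in resp.json():
        print(entry["visit_time"], entry["title"], entry["url"])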
@app.get("/bookmarks/search")
async def search_bookmarks(
domain: Optional[str] = Query(None),
folder: Optional[str] = Query(None),
search_term: Optional[str] = Query(None),
db: Session = Depends(get_db)
):
"""Search bookmarks with optimized queries"""
try:
# Build query efficiently
query = db.query(Bookmark)
# Apply filters using index-optimized queries
if domain:
query = query.filter(Bookmark.domain == domain)
if folder:
query = query.filter(Bookmark.folder == folder)
if search_term:
# Use LIKE with index hint for title search
search_pattern = f"%{search_term}%"
query = query.filter(
Bookmark.title.ilike(search_pattern)
).with_hint(
Bookmark,
'INDEXED BY ix_bookmarks_title',
'sqlite'
)
# Add ordering and limit for better performance
bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
return [serialize_bookmark(bookmark) for bookmark in bookmarks]
except Exception as e:
logger.error(f"Bookmark search error: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail="Search operation failed")
# Add new endpoint for advanced full-text search
@app.get("/history/search/advanced")
async def advanced_history_search(
query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
include_content: bool = Query(False),
db: Session = Depends(get_db)
):
"""Advanced full-text search using SQLite FTS5 features"""
try:
# Use raw SQL for the advanced FTS query (MATCH must use the table alias once aliased)
fts_query = """
SELECT h.*, f.rank AS rank
FROM history h
INNER JOIN history_fts f ON h.id = f.rowid
WHERE f MATCH :query
ORDER BY rank
LIMIT 1000
"""
results = db.execute(text(fts_query), {'query': query}).all()
# Convert results to HistoryEntry objects
entries = [
serialize_history_entry(
HistoryEntry(
id=row.id,
url=row.url,
title=row.title,
visit_time=row.visit_time,
domain=row.domain,
markdown_content=row.markdown_content if include_content else None
),
include_content
)
for row in results
]
return entries
except Exception as e:
logger.error(f"Advanced search error: {str(e)}", exc_info=True)
raise HTTPException(status_code=500, detail="Advanced search operation failed")
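# The query string above is handed straight to FTS5's MATCH, so the full FTS5
# grammar applies. A self-contained sketch of the accepted syntax (stand-in
# table, not the app's history_fts schema); defined here but never called:
def _example_fts5_syntax():
    import sqlite3
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE VIRTUAL TABLE docs USING fts5(title, body)")
    conn.execute("INSERT INTO docs VALUES ('SQLite FTS5 notes', 'full text search with bm25 ranking')")
    for q in (
        '"full text" AND ranking',  # phrase plus boolean operator
        'title:sqlite',             # column filter, as with title:/domain: above
        'NEAR(text search, 3)',     # proximity search
        'rank*',                    # prefix match
    ):
        print(q, "->", conn.execute("SELECT title FROM docs WHERE docs MATCH ?", (q,)).fetchall())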
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
logger.info("New WebSocket connection established")
page_reader = PageReader()
await websocket.accept()
try:
while True:
data = await websocket.receive_json()
# Parse the URL and check if domain should be ignored
domain = urlparse(data['url']).netloc
if config.is_domain_ignored(domain):
logger.info(f"Ignoring domain: {domain}")
await websocket.send_json({
"status": "ignored",
"message": f"Domain {domain} is in ignore list"
})
continue
logger.info(f"Processing page: {data['url']}")
timestamp = iso8601.parse_date(data['timestamp'])
# Check if we already have a recent entry for this URL
existing_entry = db.query(HistoryEntry).filter(
HistoryEntry.url == data['url'],
HistoryEntry.visit_time >= timestamp - timedelta(minutes=5)
).first()
if existing_entry:
print(f"Recent entry exists for URL: {data['url']}")
await websocket.send_json({
"status": "skipped",
"message": "Recent entry exists"
})
continue
page_info = PageInfo(
url=data['url'],
html=data['html'],
timestamp=timestamp
)
# Debug HTML content
logger.debug(f"HTML content length before processing: {len(page_info.html)}")
# Extract title
soup = BeautifulSoup(page_info.html, 'html.parser')
title = soup.title.string if soup.title else ''
logger.debug(f"Extracted title: {title}")
# Clean the HTML, then convert the cleaned HTML to markdown
logger.debug("Starting markdown conversion...")
cleaned_html = page_reader.clean_html(page_info.html)
logger.debug(f"Cleaned HTML length: {len(cleaned_html)}")
markdown_content = page_reader.html_to_markdown(cleaned_html)
logger.debug(f"Markdown conversion complete. Content length: {len(markdown_content) if markdown_content else 0}")
if markdown_content:
logger.debug(f"First 100 chars of markdown: {markdown_content[:100]}")
else:
logger.debug("No markdown content generated")
if not title and not markdown_content:
logger.info(f"No content extracted from: {page_info.url}")
await websocket.send_json({
"status": "skipped",
"message": "No content extracted"
})
continue
# Create history entry
history_entry = HistoryEntry(
url=page_info.url,
title=title,
visit_time=page_info.timestamp,
domain=domain,
markdown_content=markdown_content,
last_content_update=datetime.now(timezone.utc)
)
logger.debug(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}")
db.add(history_entry)
try:
db.commit()
logger.info(f"Successfully saved entry for: {page_info.url}")
logger.debug(f"Markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}")
await websocket.send_json({
"status": "success",
"message": f"Processed page: {page_info.url}"
})
except Exception as e:
db.rollback()
logger.error(f"Error saving entry: {e}", exc_info=True)
await websocket.send_json({
"status": "error",
"message": "Database error"
})
except WebSocketDisconnect:
logger.info("Client disconnected")
except Exception as e:
logger.error("Error in WebSocket handler", exc_info=True)
finally:
await page_reader.close()
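# Illustrative test client for the handler above (not part of the app). Assumes
# the third-party `websockets` package and a server on localhost:8000; the
# message keys match what the handler reads (url, html, timestamp).
async def _example_ws_client():
    import json
    import websockets  # assumed dependency: pip install websockets
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        await ws.send(json.dumps({
            "url": "https://example.com/article",
            "html": "<html><head><title>Example</title></head><body><p>Hello</p></body></html>",
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }))
        print(await ws.recv())  # e.g. {"status": "success", ...}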
@app.get("/config/ignored-domains")
async def get_ignored_domains():
"""Get list of ignored domain patterns"""
return {"ignored_domains": config.config.get('ignored_domains', [])}
@app.post("/config/ignored-domains")
async def add_ignored_domain(pattern: str):
"""Add a new domain pattern to ignored list"""
config.add_ignored_domain(pattern)
return {"status": "success", "message": f"Added pattern: {pattern}"}
@app.delete("/config/ignored-domains/{pattern}")
async def remove_ignored_domain(pattern: str):
"""Remove a domain pattern from ignored list"""
config.remove_ignored_domain(pattern)
return {"status": "success", "message": f"Removed pattern: {pattern}"}
@app.get("/")
async def home(request: Request, db: Session = Depends(get_db)):
# Get recent history entries
entries = db.query(HistoryEntry)\
.order_by(HistoryEntry.visit_time.desc())\
.limit(50)\
.all()
return templates.TemplateResponse(
"index.html",
{"request": request, "entries": entries}
)
@app.get("/search")
async def search_page(request: Request):
return templates.TemplateResponse(
"search.html",
{"request": request}
)
@app.get("/bookmarks")
async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
bookmarks = db.query(Bookmark)\
.order_by(Bookmark.added_time.desc())\
.limit(50)\
.all()
return templates.TemplateResponse(
"bookmarks.html",
{"request": request, "bookmarks": bookmarks}
)
# --- Helper Function (Initial Sync) ---
def process_browser_history():
"""Fetches and stores new history entries from the browser_history library (initial sync)."""
try:
logger.info("Starting browser history processing (initial sync)")
outputs = browser_history.get_history()
# browser_history returns a platform-specific History object; pull out the histories list
history_list = []
if hasattr(outputs, 'histories') and outputs.histories:
history_list = outputs.histories # List of (datetime, url, title)
else:
logger.warning("Could not retrieve histories list from browser_history output.")
return # Exit if no history list found
logger.info(f"Found {len(history_list)} total history items from browser_history library")
current_timestamp_dt = datetime.now(timezone.utc)
current_timestamp = int(current_timestamp_dt.timestamp()) # Use a timezone-aware timestamp
source_key = "browser_history_sync" # Differentiate from the scheduler source
last_timestamp = get_last_processed_timestamp(source_key) or 0 # Ensure it's 0 if None
logger.info(f"Last processed timestamp for initial sync '{source_key}': {last_timestamp}")
new_entries = []
processed_urls_times = set() # Avoid duplicates within the batch
for entry in history_list:
# Basic validation of the entry structure
if not isinstance(entry, (tuple, list)) or len(entry) < 2:
logger.warning(f"Skipping malformed history entry: {entry}")
continue
timestamp, url = entry[0], entry[1]
title = entry[2] if len(entry) > 2 else "" # Handle optional title
if not url or not timestamp:
logger.warning(f"Skipping entry with missing URL or timestamp: Title='{title}'")
continue
# Ensure the timestamp is a datetime object
if not isinstance(timestamp, datetime):
logger.warning(f"Skipping entry with non-datetime timestamp ({type(timestamp)}): {url}")
continue
# Normalize the timestamp (assume local if naive, convert to UTC)
if timestamp.tzinfo is None or timestamp.tzinfo.utcoffset(timestamp) is None:
try:
timestamp_aware = timestamp.astimezone() # Make aware using the system's local timezone
except Exception as tz_err:
logger.warning(f"Could not determine local timezone for naive timestamp {timestamp}. Assuming UTC. Error: {tz_err}")
timestamp_aware = timestamp.replace(tzinfo=timezone.utc) # Fall back to UTC
else:
timestamp_aware = timestamp
timestamp_utc = timestamp_aware.astimezone(timezone.utc)
# Keep only entries newer than the last processed UTC timestamp
if timestamp_utc.timestamp() > last_timestamp:
entry_key = (url, timestamp_utc.timestamp())
if entry_key in processed_urls_times:
continue # Skip duplicates within this batch
new_entries.append((timestamp_utc, url, title))
processed_urls_times.add(entry_key)
logger.info(f"Found {len(new_entries)} new entries for initial sync after filtering")
if new_entries:
added_count = 0
skipped_ignored = 0
# Use a context manager for the session
with next(get_db()) as db:
try:
for timestamp_utc, url, title in new_entries:
domain = urlparse(url).netloc
if config_manager.is_domain_ignored(domain):
skipped_ignored += 1
continue
history_entry = HistoryEntry(
url=url,
title=title or "", # Ensure the title is not None
visit_time=timestamp_utc,
domain=domain
# Note: no markdown content here, only basic history
)
db.add(history_entry)
added_count += 1
if added_count > 0:
db.commit()
logger.info(f"Committed {added_count} new history entries from initial sync.")
# Update the last processed timestamp only after a successful commit
update_last_processed_timestamp(source_key, current_timestamp)
logger.info(f"Updated initial sync timestamp for '{source_key}' to {current_timestamp}")
else:
logger.info("No new unique entries to commit during initial sync.")
# Update the timestamp even if nothing new was added, to mark the sync time
update_last_processed_timestamp(source_key, current_timestamp)
logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
if skipped_ignored > 0:
logger.info(f"Skipped {skipped_ignored} entries due to ignored domains during initial sync.")
except Exception as e:
logger.error(f"Error storing history items during initial sync: {str(e)}", exc_info=True)
db.rollback()
else:
logger.info("No new history entries found during initial sync.")
# Update the timestamp even if nothing new was found, to mark the sync time
update_last_processed_timestamp(source_key, current_timestamp)
logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
except ImportError:
logger.warning("`browser_history` library not found or import failed. Skipping initial sync.")
except Exception as e:
logger.error(f"Error processing browser history during initial sync: {str(e)}", exc_info=True)
# --- Startup and Shutdown Events ---
@app.on_event("startup")
async def startup_event():
global crawler, scheduler # Allow modification of globals
logger.info("Starting application initialization...")
try:
# 1. Ensure base tables exist
logger.info("Ensuring base tables exist...")
create_tables()
# 2. Initialize the crawler
logger.info("Initializing AsyncWebCrawler...")
if crawler is None:
crawler = AsyncWebCrawler()
logger.info("AsyncWebCrawler initialized.")
# 3. Initialize the Scheduler *after* the crawler
logger.info("Initializing HistoryScheduler...")
if scheduler is None:
scheduler = HistoryScheduler(crawler=crawler) # Pass crawler instance
logger.info("HistoryScheduler initialized.")
# 4. Perform initial history sync from browser_history library
logger.info("Performing initial browser history sync...")
process_browser_history() # Sync history not processed before
# 5. Perform initial bookmark sync (using scheduler's method)
# Run in background to avoid blocking startup if it takes long
logger.info("Starting initial bookmark sync task...")
asyncio.create_task(scheduler.update_bookmarks())
# 6. Start background tasks (scheduler for ongoing updates)
logger.info("Starting background history update task...")
asyncio.create_task(scheduler.update_history())
# --- Markdown Update Tasks ---
# 7a. Trigger ONE initial batch processing run in the background
logger.info("Starting initial markdown processing batch task...")
asyncio.create_task(scheduler._process_markdown_batch()) # Run one batch now
# 7b. Start the PERIODIC background markdown update task
logger.info("Starting periodic background markdown update task...")
# Use the renamed method for the loop
asyncio.create_task(scheduler.update_missing_markdown_periodically())
# --- End Markdown Update Tasks ---
logger.info("Application startup sequence initiated. Background tasks running.")
except Exception as e:
logger.error(f"FATAL ERROR during application startup: {str(e)}", exc_info=True)
raise RuntimeError(f"Application startup failed: {e}") from e
@app.on_event("shutdown")
async def shutdown_event():
global crawler, scheduler
logger.info("Starting application shutdown...")
# Stop scheduler tasks gracefully if possible (implement cancellation in tasks if needed)
# For now, we just close resources
# Close scheduler resources
if scheduler and hasattr(scheduler, 'close'):
try:
logger.info("Closing scheduler resources...")
await scheduler.close() # Call the scheduler's close method
except Exception as e:
logger.error(f"Error closing scheduler: {e}", exc_info=True)
# Close crawler if needed (check crawl4ai docs for explicit close method)
# Based on previous code, seems no explicit close needed, but keep check just in case
if crawler and hasattr(crawler, 'aclose'):
try:
logger.info("Closing AsyncWebCrawler...")
# await crawler.aclose() # Example if an async close exists
except Exception as e:
logger.error(f"Error closing crawler: {e}", exc_info=True)
# Close database engine connections if necessary (usually handled automatically by SQLAlchemy)
# if engine and hasattr(engine, 'dispose'): # Check if using async engine that needs dispose
# await engine.dispose()
logger.info("Application shutdown complete.")
# --- Include Routers ---
app.include_router(history.router)
app.include_router(bookmarks.router)
app.include_router(api_config.router)
app.include_router(websocket.router)
app.include_router(ui.router)
# Optional: Add a root endpoint for health check or basic info
@app.get("/health", tags=["service"])
async def health_check():
# Extended health check could verify DB connection or task status
db_ok = False
try:
with next(get_db()) as db:
db.execute("SELECT 1")
db_ok = True
except Exception:
db_ok = False
return {
"status": "ok",
"database_connection": "ok" if db_ok else "error",
# Add other checks as needed
}
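# Dev entry point sketch (the module path "app.main:app" is an assumption based
# on the app/ layout above). With uv-managed dependencies this can be launched
# as `uv run python -m app.main` or `uv run uvicorn app.main:app --reload`.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("app.main:app", host="127.0.0.1", port=8000, reload=True)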