# browser-recall/app/main.py
from fastapi import FastAPI, Depends
from sqlalchemy.orm import Session
from sqlalchemy import text
from datetime import datetime, timezone
from typing import Optional
import asyncio
from urllib.parse import urlparse

from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles

import browser_history
from crawl4ai import AsyncWebCrawler

# Local imports
from .logging_config import setup_logger
from .database import (
    get_db,
    HistoryEntry,
    get_last_processed_timestamp,
    update_last_processed_timestamp,
    create_tables,
    engine,
    # recreate_fts_tables  # Keep if needed, but often done manually or via a migration tool
)
from .config import Config

# Import Routers
from .routers import history, bookmarks, config as api_config, websocket, ui

logger = setup_logger(__name__)

# --- Global Variables ---
# These are accessed by other modules (like the websocket router).
# Consider using app state or dependency injection for cleaner management if complexity grows.
config_manager = Config()  # Renamed to avoid conflict with the config router import
crawler: Optional[AsyncWebCrawler] = None

# Import the scheduler *after* the crawler is defined
from .scheduler import HistoryScheduler

scheduler: Optional[HistoryScheduler] = None
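# A minimal sketch (not wired in) of the app.state alternative mentioned above; the
# attribute names here are illustrative only, not part of the current code:
#
#   app.state.config_manager = Config()
#   app.state.crawler = AsyncWebCrawler()
#   app.state.scheduler = HistoryScheduler(crawler=app.state.crawler)
#
# Routers would then read e.g. `request.app.state.crawler` instead of importing module globals.
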
# --- FastAPI App Initialization ---
app = FastAPI(title="Browser History Search API")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Adjust in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount static files
app.mount("/static", StaticFiles(directory="app/static"), name="static")
# Note: templates are used within the ui router now; no global instance is needed here.

# --- Helper Function (Initial Sync) ---
def process_browser_history():
    """Fetches and stores new history entries from the browser_history library (initial sync)."""
    try:
        logger.info("Starting browser history processing (initial sync)")
        outputs = browser_history.get_history()
        # browser_history returns a platform-specific output object; get its histories list
        history_list = []
        if hasattr(outputs, 'histories') and outputs.histories:
            history_list = outputs.histories  # List of (datetime, url, title)
        else:
            logger.warning("Could not retrieve histories list from browser_history output.")
            return  # Exit if no history list found

        logger.info(f"Found {len(history_list)} total history items from browser_history library")
        current_timestamp_dt = datetime.now(timezone.utc)
        current_timestamp = int(current_timestamp_dt.timestamp())  # Timezone-aware timestamp
        source_key = "browser_history_sync"  # Differentiate from the scheduler source
        last_timestamp = get_last_processed_timestamp(source_key) or 0  # Ensure it's 0 if None
        logger.info(f"Last processed timestamp for initial sync '{source_key}': {last_timestamp}")

        new_entries = []
        processed_urls_times = set()  # Avoid duplicates within the batch
        for entry in history_list:
            # Basic validation of entry structure
            if not isinstance(entry, (tuple, list)) or len(entry) < 2:
                logger.warning(f"Skipping malformed history entry: {entry}")
                continue
            timestamp, url = entry[0], entry[1]
            title = entry[2] if len(entry) > 2 else ""  # Handle optional title
            if not url or not timestamp:
                logger.warning(f"Skipping entry with missing URL or timestamp: Title='{title}'")
                continue
            # Ensure timestamp is a datetime object
            if not isinstance(timestamp, datetime):
                logger.warning(f"Skipping entry with non-datetime timestamp ({type(timestamp)}): {url}")
                continue
            # Normalize timestamp (assume local time if naive, then convert to UTC)
            if timestamp.tzinfo is None or timestamp.tzinfo.utcoffset(timestamp) is None:
                try:
                    timestamp_aware = timestamp.astimezone()  # Make aware using the system local timezone
                except Exception as tz_err:
                    logger.warning(f"Could not determine local timezone for naive timestamp {timestamp}. Assuming UTC. Error: {tz_err}")
                    timestamp_aware = timestamp.replace(tzinfo=timezone.utc)  # Fallback to UTC
            else:
                timestamp_aware = timestamp
            timestamp_utc = timestamp_aware.astimezone(timezone.utc)
            # Filter for only new entries based on the normalized UTC timestamp
            if timestamp_utc.timestamp() > last_timestamp:
                entry_key = (url, timestamp_utc.timestamp())
                if entry_key in processed_urls_times:
                    continue  # Skip duplicate within this batch
                new_entries.append((timestamp_utc, url, title))
                processed_urls_times.add(entry_key)

        logger.info(f"Found {len(new_entries)} new entries for initial sync after filtering")
        if new_entries:
            added_count = 0
            skipped_ignored = 0
            # Use a context manager for the session
            with next(get_db()) as db:
                try:
                    for timestamp_utc, url, title in new_entries:
                        domain = urlparse(url).netloc
                        if config_manager.is_domain_ignored(domain):
                            # logger.debug(f"Skipping ignored domain during initial sync: {domain}")
                            skipped_ignored += 1
                            continue
                        # Optional: check more robustly whether the entry already exists
                        # existing = db.query(HistoryEntry.id).filter(HistoryEntry.url == url, HistoryEntry.visit_time == timestamp_utc).first()
                        # if existing:
                        #     continue
                        history_entry = HistoryEntry(
                            url=url,
                            title=title or "",  # Ensure title is not None
                            visit_time=timestamp_utc,
                            domain=domain
                            # Note: no markdown content here, only basic history
                        )
                        db.add(history_entry)
                        added_count += 1
                    if added_count > 0:
                        db.commit()
                        logger.info(f"Committed {added_count} new history entries from initial sync.")
                        # Update the last processed timestamp only after a successful commit
                        update_last_processed_timestamp(source_key, current_timestamp)
                        logger.info(f"Updated initial sync timestamp for '{source_key}' to {current_timestamp}")
                    else:
                        logger.info("No new unique entries to commit during initial sync.")
                        # Update the timestamp even if nothing new was added, to mark the sync time
                        update_last_processed_timestamp(source_key, current_timestamp)
                        logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
                    if skipped_ignored > 0:
                        logger.info(f"Skipped {skipped_ignored} entries due to ignored domains during initial sync.")
                except Exception as e:
                    logger.error(f"Error storing history item during initial sync: {str(e)}", exc_info=True)
                    db.rollback()
        else:
            logger.info("No new history entries found during initial sync.")
            # Update the timestamp even if nothing new was found, to mark the sync time
            update_last_processed_timestamp(source_key, current_timestamp)
            logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
    except ImportError:
        logger.warning("`browser_history` library not found or import failed. Skipping initial sync.")
    except Exception as e:
        logger.error(f"Error processing browser history during initial sync: {str(e)}", exc_info=True)

# --- Startup and Shutdown Events ---
@app.on_event("startup")
async def startup_event():
    global crawler, scheduler  # Allow modification of globals
    logger.info("Starting application initialization...")
    try:
        # 1. Ensure base tables exist
        logger.info("Ensuring base tables exist...")
        create_tables()

        # 2. Initialize the crawler
        logger.info("Initializing AsyncWebCrawler...")
        if crawler is None:
            crawler = AsyncWebCrawler()
        logger.info("AsyncWebCrawler initialized.")

        # 3. Initialize the scheduler *after* the crawler
        logger.info("Initializing HistoryScheduler...")
        if scheduler is None:
            scheduler = HistoryScheduler(crawler=crawler)  # Pass the crawler instance
        logger.info("HistoryScheduler initialized.")

        # 4. Perform the initial history sync from the browser_history library
        logger.info("Performing initial browser history sync...")
        process_browser_history()  # Sync history not processed before

        # 5. Perform the initial bookmark sync (using the scheduler's method).
        #    Run in the background to avoid blocking startup if it takes long.
        logger.info("Starting initial bookmark sync task...")
        asyncio.create_task(scheduler.update_bookmarks())

        # 6. Start background tasks (scheduler for ongoing updates)
        logger.info("Starting background history update task...")
        asyncio.create_task(scheduler.update_history())

        # --- Markdown Update Tasks ---
        # 7a. Trigger ONE initial batch processing run in the background
        logger.info("Starting initial markdown processing batch task...")
        asyncio.create_task(scheduler._process_markdown_batch())  # Run one batch now

        # 7b. Start the PERIODIC background markdown update task
        logger.info("Starting periodic background markdown update task...")
        asyncio.create_task(scheduler.update_missing_markdown_periodically())
        # --- End Markdown Update Tasks ---

        logger.info("Application startup sequence initiated. Background tasks running.")
    except Exception as e:
        logger.error(f"FATAL ERROR during application startup: {str(e)}", exc_info=True)
        raise RuntimeError(f"Application startup failed: {e}") from e
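
# Note: newer FastAPI versions deprecate @app.on_event in favor of a lifespan context
# manager. A rough, untested sketch of how these startup/shutdown handlers could be
# folded into one (not used in this module):
#
#   from contextlib import asynccontextmanager
#
#   @asynccontextmanager
#   async def lifespan(app: FastAPI):
#       # ... startup work from startup_event() ...
#       yield
#       # ... shutdown work from shutdown_event() ...
#
#   app = FastAPI(title="Browser History Search API", lifespan=lifespan)
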
@app.on_event("shutdown")
async def shutdown_event():
    global crawler, scheduler
    logger.info("Starting application shutdown...")
    # Stop scheduler tasks gracefully if possible (implement cancellation in the tasks if needed).
    # For now, we just close resources.

    # Close scheduler resources
    if scheduler and hasattr(scheduler, 'close'):
        try:
            logger.info("Closing scheduler resources...")
            await scheduler.close()  # Call the scheduler's close method
        except Exception as e:
            logger.error(f"Error closing scheduler: {e}", exc_info=True)

    # Close the crawler if needed (check the crawl4ai docs for an explicit close method).
    # Based on previous code, no explicit close seems needed, but keep the check just in case.
    if crawler and hasattr(crawler, 'aclose'):
        try:
            logger.info("Closing AsyncWebCrawler...")
            # await crawler.aclose()  # Example if an async close exists
        except Exception as e:
            logger.error(f"Error closing crawler: {e}", exc_info=True)

    # Close database engine connections if necessary (usually handled automatically by SQLAlchemy)
    # if engine and hasattr(engine, 'dispose'):  # If using an async engine that needs dispose
    #     await engine.dispose()

    logger.info("Application shutdown complete.")
# --- Include Routers ---
app.include_router(history.router)
app.include_router(bookmarks.router)
app.include_router(api_config.router)
app.include_router(websocket.router)
app.include_router(ui.router)


# Optional: basic endpoint for health checks or service info
@app.get("/health", tags=["service"])
async def health_check():
    # An extended health check could also verify background task status
    db_ok = False
    try:
        with next(get_db()) as db:
            # Use text() so the raw SQL works on both SQLAlchemy 1.4 and 2.0
            db.execute(text("SELECT 1"))
            db_ok = True
    except Exception:
        db_ok = False
    return {
        "status": "ok",
        "database_connection": "ok" if db_ok else "error",
        # Add other checks as needed
    }
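
# Example check (assuming the default uvicorn host/port):
#   curl http://localhost:8000/health
#   -> {"status": "ok", "database_connection": "ok"}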