# browser-recall/app/main.py
from fastapi import FastAPI, Depends
from sqlalchemy.orm import Session
from sqlalchemy import text
from datetime import datetime, timezone
from typing import Optional
import asyncio
from urllib.parse import urlparse

from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles

import browser_history
from crawl4ai import AsyncWebCrawler

# Local imports
from .logging_config import setup_logger
from .database import (
    get_db,
    HistoryEntry,
    get_last_processed_timestamp,
    update_last_processed_timestamp,
    create_tables,
    engine,
    # recreate_fts_tables  # Keep if needed, but often done manually or via a migration tool
)
from .config import Config

# Import Routers
from .routers import history, bookmarks, config as api_config, websocket, ui

logger = setup_logger(__name__)

# --- Global Variables ---
# These are accessed by other modules (like the websocket router).
# Consider using app state or dependency injection for cleaner management if complexity grows.
config_manager = Config()  # Renamed to avoid conflict with the config router import
crawler: Optional[AsyncWebCrawler] = None

# Import the scheduler *after* the crawler is defined
from .scheduler import HistoryScheduler

scheduler: Optional[HistoryScheduler] = None
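# A minimal sketch (not wired in) of the app.state alternative mentioned above; the
# attribute names here are illustrative only, not part of the current code:
#
#   app.state.config_manager = Config()
#   app.state.crawler = AsyncWebCrawler()
#   app.state.scheduler = HistoryScheduler(crawler=app.state.crawler)
#
# Routers would then read e.g. `request.app.state.crawler` instead of importing module globals.
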
# --- FastAPI App Initialization ---
app = FastAPI(title="Browser History Search API")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Adjust in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Mount static files
app.mount("/static", StaticFiles(directory="app/static"), name="static")
# Note: templates are used within the ui router now; no global instance is needed here.

# --- Helper Function (Initial Sync) ---
def process_browser_history():
    """Fetches and stores new history entries from the browser_history library (initial sync)."""
    try:
        logger.info("Starting browser history processing (initial sync)")
        outputs = browser_history.get_history()
        # browser_history returns a platform-specific output object; get its histories list
        history_list = []
        if hasattr(outputs, 'histories') and outputs.histories:
            history_list = outputs.histories  # List of (datetime, url, title)
        else:
            logger.warning("Could not retrieve histories list from browser_history output.")
            return  # Exit if no history list found

        logger.info(f"Found {len(history_list)} total history items from browser_history library")
        current_timestamp_dt = datetime.now(timezone.utc)
        current_timestamp = int(current_timestamp_dt.timestamp())  # Timezone-aware timestamp
        source_key = "browser_history_sync"  # Differentiate from the scheduler source
        last_timestamp = get_last_processed_timestamp(source_key) or 0  # Ensure it's 0 if None
        logger.info(f"Last processed timestamp for initial sync '{source_key}': {last_timestamp}")

        new_entries = []
        processed_urls_times = set()  # Avoid duplicates within the batch
        for entry in history_list:
            # Basic validation of entry structure
            if not isinstance(entry, (tuple, list)) or len(entry) < 2:
                logger.warning(f"Skipping malformed history entry: {entry}")
                continue
            timestamp, url = entry[0], entry[1]
            title = entry[2] if len(entry) > 2 else ""  # Handle optional title
            if not url or not timestamp:
                logger.warning(f"Skipping entry with missing URL or timestamp: Title='{title}'")
                continue
            # Ensure timestamp is a datetime object
            if not isinstance(timestamp, datetime):
                logger.warning(f"Skipping entry with non-datetime timestamp ({type(timestamp)}): {url}")
                continue
            # Normalize timestamp (assume local time if naive, then convert to UTC)
            if timestamp.tzinfo is None or timestamp.tzinfo.utcoffset(timestamp) is None:
                try:
                    timestamp_aware = timestamp.astimezone()  # Make aware using the system local timezone
                except Exception as tz_err:
                    logger.warning(f"Could not determine local timezone for naive timestamp {timestamp}. Assuming UTC. Error: {tz_err}")
                    timestamp_aware = timestamp.replace(tzinfo=timezone.utc)  # Fallback to UTC
            else:
                timestamp_aware = timestamp
            timestamp_utc = timestamp_aware.astimezone(timezone.utc)
            # Filter for only new entries based on the normalized UTC timestamp
            if timestamp_utc.timestamp() > last_timestamp:
                entry_key = (url, timestamp_utc.timestamp())
                if entry_key in processed_urls_times:
                    continue  # Skip duplicate within this batch
                new_entries.append((timestamp_utc, url, title))
                processed_urls_times.add(entry_key)

        logger.info(f"Found {len(new_entries)} new entries for initial sync after filtering")
        if new_entries:
            added_count = 0
            skipped_ignored = 0
            # Use a context manager for the session
            with next(get_db()) as db:
                try:
                    for timestamp_utc, url, title in new_entries:
                        domain = urlparse(url).netloc
                        if config_manager.is_domain_ignored(domain):
                            # logger.debug(f"Skipping ignored domain during initial sync: {domain}")
                            skipped_ignored += 1
                            continue
                        # Optional: check more robustly whether the entry already exists
                        # existing = db.query(HistoryEntry.id).filter(HistoryEntry.url == url, HistoryEntry.visit_time == timestamp_utc).first()
                        # if existing:
                        #     continue
                        history_entry = HistoryEntry(
                            url=url,
                            title=title or "",  # Ensure title is not None
                            visit_time=timestamp_utc,
                            domain=domain
                            # Note: no markdown content here, only basic history
                        )
                        db.add(history_entry)
                        added_count += 1
                    if added_count > 0:
                        db.commit()
                        logger.info(f"Committed {added_count} new history entries from initial sync.")
                        # Update the last processed timestamp only after a successful commit
                        update_last_processed_timestamp(source_key, current_timestamp)
                        logger.info(f"Updated initial sync timestamp for '{source_key}' to {current_timestamp}")
                    else:
                        logger.info("No new unique entries to commit during initial sync.")
                        # Update the timestamp even if nothing new was added, to mark the sync time
                        update_last_processed_timestamp(source_key, current_timestamp)
                        logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
                    if skipped_ignored > 0:
                        logger.info(f"Skipped {skipped_ignored} entries due to ignored domains during initial sync.")
                except Exception as e:
                    logger.error(f"Error storing history item during initial sync: {str(e)}", exc_info=True)
                    db.rollback()
        else:
            logger.info("No new history entries found during initial sync.")
            # Update the timestamp even if nothing new was found, to mark the sync time
            update_last_processed_timestamp(source_key, current_timestamp)
            logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
    except ImportError:
        logger.warning("`browser_history` library not found or import failed. Skipping initial sync.")
    except Exception as e:
        logger.error(f"Error processing browser history during initial sync: {str(e)}", exc_info=True)

# --- Startup and Shutdown Events ---
@app.on_event("startup")
async def startup_event():
    global crawler, scheduler  # Allow modification of globals
    logger.info("Starting application initialization...")
    try:
        # 1. Ensure base tables exist
        logger.info("Ensuring base tables exist...")
        create_tables()

        # 2. Initialize the crawler
        logger.info("Initializing AsyncWebCrawler...")
        if crawler is None:
            crawler = AsyncWebCrawler()
        logger.info("AsyncWebCrawler initialized.")

        # 3. Initialize the scheduler *after* the crawler
        logger.info("Initializing HistoryScheduler...")
        if scheduler is None:
            scheduler = HistoryScheduler(crawler=crawler)  # Pass the crawler instance
        logger.info("HistoryScheduler initialized.")

        # 4. Perform the initial history sync from the browser_history library
        logger.info("Performing initial browser history sync...")
        process_browser_history()  # Sync history not processed before

        # 5. Perform the initial bookmark sync (using the scheduler's method).
        #    Run in the background to avoid blocking startup if it takes long.
        logger.info("Starting initial bookmark sync task...")
        asyncio.create_task(scheduler.update_bookmarks())

        # 6. Start background tasks (scheduler for ongoing updates)
        logger.info("Starting background history update task...")
        asyncio.create_task(scheduler.update_history())

        # --- Markdown Update Tasks ---
        # 7a. Trigger ONE initial batch processing run in the background
        logger.info("Starting initial markdown processing batch task...")
        asyncio.create_task(scheduler._process_markdown_batch())  # Run one batch now

        # 7b. Start the PERIODIC background markdown update task
        logger.info("Starting periodic background markdown update task...")
        asyncio.create_task(scheduler.update_missing_markdown_periodically())
        # --- End Markdown Update Tasks ---

        logger.info("Application startup sequence initiated. Background tasks running.")
    except Exception as e:
        logger.error(f"FATAL ERROR during application startup: {str(e)}", exc_info=True)
        raise RuntimeError(f"Application startup failed: {e}") from e
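
# Note: newer FastAPI versions deprecate @app.on_event in favor of a lifespan context
# manager. A rough, untested sketch of how these startup/shutdown handlers could be
# folded into one (not used in this module):
#
#   from contextlib import asynccontextmanager
#
#   @asynccontextmanager
#   async def lifespan(app: FastAPI):
#       # ... startup work from startup_event() ...
#       yield
#       # ... shutdown work from shutdown_event() ...
#
#   app = FastAPI(title="Browser History Search API", lifespan=lifespan)
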
@app.on_event("shutdown")
async def shutdown_event():
    global crawler, scheduler
    logger.info("Starting application shutdown...")
    # Stop scheduler tasks gracefully if possible (implement cancellation in the tasks if needed).
    # For now, we just close resources.

    # Close scheduler resources
    if scheduler and hasattr(scheduler, 'close'):
        try:
            logger.info("Closing scheduler resources...")
            await scheduler.close()  # Call the scheduler's close method
        except Exception as e:
            logger.error(f"Error closing scheduler: {e}", exc_info=True)

    # Close the crawler if needed (check the crawl4ai docs for an explicit close method).
    # Based on previous code, no explicit close seems needed, but keep the check just in case.
    if crawler and hasattr(crawler, 'aclose'):
        try:
            logger.info("Closing AsyncWebCrawler...")
            # await crawler.aclose()  # Example if an async close exists
        except Exception as e:
            logger.error(f"Error closing crawler: {e}", exc_info=True)

    # Close database engine connections if necessary (usually handled automatically by SQLAlchemy)
    # if engine and hasattr(engine, 'dispose'):  # If using an async engine that needs dispose
    #     await engine.dispose()

    logger.info("Application shutdown complete.")
# --- Include Routers ---
app.include_router(history.router)
app.include_router(bookmarks.router)
app.include_router(api_config.router)
app.include_router(websocket.router)
app.include_router(ui.router)


# Optional: basic endpoint for health checks or service info
@app.get("/health", tags=["service"])
async def health_check():
    # An extended health check could also verify background task status
    db_ok = False
    try:
        with next(get_db()) as db:
            # Use text() so the raw SQL works on both SQLAlchemy 1.4 and 2.0
            db.execute(text("SELECT 1"))
            db_ok = True
    except Exception:
        db_ok = False
    return {
        "status": "ok",
        "database_connection": "ok" if db_ok else "error",
        # Add other checks as needed
    }
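
# Example check (assuming the default uvicorn host/port):
#   curl http://localhost:8000/health
#   -> {"status": "ok", "database_connection": "ok"}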