from fastapi import FastAPI, Depends
from sqlalchemy.orm import Session
from sqlalchemy import text
from datetime import datetime, timezone
from typing import Optional
import asyncio
from urllib.parse import urlparse
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
import browser_history
from crawl4ai import AsyncWebCrawler

# Local imports
from .logging_config import setup_logger
from .database import (
    get_db,
    HistoryEntry,
    get_last_processed_timestamp,
    update_last_processed_timestamp,
    create_tables,
    engine,
    # recreate_fts_tables  # Keep if needed, but often done manually or via a migration tool
)
from .config import Config

# Import Routers
from .routers import history, bookmarks, config as api_config, websocket, ui

logger = setup_logger(__name__)

# --- Global Variables ---
# These are accessed by other modules (like the websocket router).
# Consider using app state or dependency injection for cleaner management if complexity grows.
config_manager = Config()  # Renamed to avoid a conflict with the router import
crawler: Optional[AsyncWebCrawler] = None
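
# A sketch of the app-state alternative mentioned above (not wired in; routers
# would then read request.app.state.crawler instead of importing this module's
# globals):
#   app.state.config_manager = config_manager  # once `app` exists below
#   app.state.crawler = crawler                # assigned for real at startup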

# Import scheduler *after* crawler is defined
from .scheduler import HistoryScheduler
scheduler: Optional[HistoryScheduler] = None  # Instance is created during startup

# --- FastAPI App Initialization ---
app = FastAPI(title="Browser History Search API")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Adjust in production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
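# Example of tightening this for production (hypothetical origin; use wherever
# the UI is actually served from):
#   allow_origins=["http://localhost:8000"]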

# Mount static files
app.mount("/static", StaticFiles(directory="app/static"), name="static")
# Note: templates are used within the ui router now; no global instance is needed here unless used elsewhere.

# --- Helper Function (Initial Sync) ---
def process_browser_history():
    """Fetch and store new history entries from the browser_history library (initial sync)."""
    try:
        logger.info("Starting browser history processing (initial sync)")
        outputs = browser_history.get_history()
        # browser_history returns a platform-specific Outputs object; pull out its histories list
        history_list = []
        if hasattr(outputs, 'histories') and outputs.histories:
            history_list = outputs.histories  # List of (datetime, url, title) tuples
        else:
            logger.warning("Could not retrieve histories list from browser_history output.")
            return  # Exit if no history list found

logger.info(f"Found {len(history_list)} total history items from browser_history library")
|
|
|
|
current_timestamp_dt = datetime.now(timezone.utc)
|
|
current_timestamp = int(current_timestamp_dt.timestamp()) # Use timezone-aware timestamp
|
|
source_key = "browser_history_sync" # Differentiate from scheduler source
|
|
last_timestamp = get_last_processed_timestamp(source_key) or 0 # Ensure it's 0 if None
|
|
|
|
logger.info(f"Last processed timestamp for initial sync '{source_key}': {last_timestamp}")
|
|
|
|
new_entries = []
|
|
processed_urls_times = set() # Avoid duplicates within the batch
|
|
|
|
        for entry in history_list:
            # Basic validation of the entry structure
            if not isinstance(entry, (tuple, list)) or len(entry) < 2:
                logger.warning(f"Skipping malformed history entry: {entry}")
                continue
            timestamp, url = entry[0], entry[1]
            title = entry[2] if len(entry) > 2 else ""  # Title is optional

            if not url or not timestamp:
                logger.warning(f"Skipping entry with missing URL or timestamp: Title='{title}'")
                continue

            # Ensure the timestamp is a datetime object
            if not isinstance(timestamp, datetime):
                logger.warning(f"Skipping entry with non-datetime timestamp ({type(timestamp)}): {url}")
                continue

            # Normalize the timestamp (assume local time if naive, then convert to UTC)
            if timestamp.tzinfo is None or timestamp.tzinfo.utcoffset(timestamp) is None:
                try:
                    timestamp_aware = timestamp.astimezone()  # Attach the system's local timezone
                except Exception as tz_err:
                    logger.warning(f"Could not determine local timezone for naive timestamp {timestamp}. Assuming UTC. Error: {tz_err}")
                    timestamp_aware = timestamp.replace(tzinfo=timezone.utc)  # Fall back to UTC
            else:
                timestamp_aware = timestamp
            timestamp_utc = timestamp_aware.astimezone(timezone.utc)
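            # e.g. a naive 2024-01-01 12:00 read on a UTC-5 machine becomes
            # 2024-01-01 17:00+00:00 after normalization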

            # Keep only entries newer than the last processed UTC timestamp
            if timestamp_utc.timestamp() > last_timestamp:
                entry_key = (url, timestamp_utc.timestamp())
                if entry_key in processed_urls_times:
                    continue  # Skip duplicates within this batch

                new_entries.append((timestamp_utc, url, title))
                processed_urls_times.add(entry_key)

logger.info(f"Found {len(new_entries)} new entries for initial sync after filtering")
|
|
|
|
if new_entries:
|
|
added_count = 0
|
|
skipped_ignored = 0
|
|
# Use context manager for session
|
|
with next(get_db()) as db:
|
|
try:
|
|
for timestamp_utc, url, title in new_entries:
|
|
domain = urlparse(url).netloc
|
|
if config_manager.is_domain_ignored(domain):
|
|
# logger.debug(f"Skipping ignored domain during initial sync: {domain}")
|
|
skipped_ignored += 1
|
|
continue
|
|
|
|
# Optional: Check if entry already exists more robustly
|
|
# existing = db.query(HistoryEntry.id).filter(HistoryEntry.url == url, HistoryEntry.visit_time == timestamp_utc).first()
|
|
# if existing:
|
|
# continue
|
|
|
|
history_entry = HistoryEntry(
|
|
url=url,
|
|
title=title or "", # Ensure title is not None
|
|
visit_time=timestamp_utc,
|
|
domain=domain
|
|
# Note: No markdown content here, only basic history
|
|
)
|
|
db.add(history_entry)
|
|
added_count += 1
|
|
|
|
if added_count > 0:
|
|
db.commit()
|
|
logger.info(f"Committed {added_count} new history entries from initial sync.")
|
|
# Update the last processed timestamp only if successful commit
|
|
update_last_processed_timestamp(source_key, current_timestamp)
|
|
logger.info(f"Updated initial sync timestamp for '{source_key}' to {current_timestamp}")
|
|
else:
|
|
logger.info("No new unique entries to commit during initial sync.")
|
|
# Update timestamp even if nothing new added, to mark sync time
|
|
update_last_processed_timestamp(source_key, current_timestamp)
|
|
logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
|
|
|
|
|
|
if skipped_ignored > 0:
|
|
logger.info(f"Skipped {skipped_ignored} entries due to ignored domains during initial sync.")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error storing history item during initial sync: {str(e)}", exc_info=True)
|
|
db.rollback()
|
|
else:
|
|
logger.info("No new history entries found during initial sync.")
|
|
# Update timestamp even if nothing new found, to mark sync time
|
|
update_last_processed_timestamp(source_key, current_timestamp)
|
|
logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
|
|
|
|
|
|
except ImportError:
|
|
logger.warning("`browser_history` library not found or import failed. Skipping initial sync.")
|
|
except Exception as e:
|
|
logger.error(f"Error processing browser history during initial sync: {str(e)}", exc_info=True)
|
|
|
|
|
|
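
# The `with next(get_db()) as db:` pattern above leans on SQLAlchemy 1.4+'s
# Session context manager and leaves the get_db() generator suspended, deferring
# any cleanup in its `finally` block to garbage collection. A minimal alternative
# that drives the generator to completion (a sketch, assuming get_db() is the
# usual yield-a-Session FastAPI dependency). Usage: `with db_session() as db:`.
from contextlib import contextmanager

@contextmanager
def db_session():
    gen = get_db()
    db = next(gen)  # the Session yielded by the dependency
    try:
        yield db
    finally:
        gen.close()  # runs the generator's cleanup (e.g. db.close())

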
# --- Startup and Shutdown Events ---
@app.on_event("startup")
async def startup_event():
    global crawler, scheduler  # Allow modification of the module-level globals
    logger.info("Starting application initialization...")

    try:
        # 1. Ensure base tables exist
        logger.info("Ensuring base tables exist...")
        create_tables()

        # 2. Initialize the crawler
        logger.info("Initializing AsyncWebCrawler...")
        if crawler is None:
            crawler = AsyncWebCrawler()
            logger.info("AsyncWebCrawler initialized.")

        # 3. Initialize the scheduler *after* the crawler
        logger.info("Initializing HistoryScheduler...")
        if scheduler is None:
            scheduler = HistoryScheduler(crawler=crawler)  # Pass the crawler instance
            logger.info("HistoryScheduler initialized.")

        # 4. Perform the initial history sync from the browser_history library
        logger.info("Performing initial browser history sync...")
        process_browser_history()  # Sync history not processed before

        # 5. Perform the initial bookmark sync (using the scheduler's method).
        # Run it in the background to avoid blocking startup if it takes long.
        logger.info("Starting initial bookmark sync task...")
        asyncio.create_task(scheduler.update_bookmarks())

        # 6. Start the background task for ongoing history updates
        logger.info("Starting background history update task...")
        asyncio.create_task(scheduler.update_history())

        # --- Markdown Update Tasks ---
        # 7a. Trigger ONE initial batch processing run in the background
        logger.info("Starting initial markdown processing batch task...")
        asyncio.create_task(scheduler._process_markdown_batch())  # Run one batch now

        # 7b. Start the PERIODIC background markdown update task
        logger.info("Starting periodic background markdown update task...")
        # Use the renamed periodic method for the loop
        asyncio.create_task(scheduler.update_missing_markdown_periodically())
        # --- End Markdown Update Tasks ---
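
        # NOTE: the Task handles returned by asyncio.create_task() are not kept;
        # per the asyncio docs, holding references (e.g. in a module-level list)
        # prevents the tasks from being garbage-collected mid-flight and would
        # also allow the shutdown hook to cancel them.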

        logger.info("Application startup sequence initiated. Background tasks are running.")

    except Exception as e:
        logger.error(f"FATAL ERROR during application startup: {str(e)}", exc_info=True)
        raise RuntimeError(f"Application startup failed: {e}") from e


@app.on_event("shutdown")
async def shutdown_event():
    global crawler, scheduler
    logger.info("Starting application shutdown...")

    # Stop scheduler tasks gracefully if possible (this would require keeping
    # and cancelling the Task handles); for now we only close resources.
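    # A sketch of what graceful cancellation could look like (assumes a
    # hypothetical module-level `background_tasks` list populated at startup,
    # which the current code does not maintain):
    #   for task in background_tasks:
    #       task.cancel()
    #   await asyncio.gather(*background_tasks, return_exceptions=True)
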
    # Close scheduler resources
    if scheduler and hasattr(scheduler, 'close'):
        try:
            logger.info("Closing scheduler resources...")
            await scheduler.close()  # Call the scheduler's close method
        except Exception as e:
            logger.error(f"Error closing scheduler: {e}", exc_info=True)

    # Close the crawler if needed (check the crawl4ai docs for an explicit close method).
    # No explicit close appears to be required, but keep the guarded call just in case.
    if crawler and hasattr(crawler, 'aclose'):
        try:
            logger.info("Closing AsyncWebCrawler...")
            await crawler.aclose()  # Only reached if an async close method exists
        except Exception as e:
            logger.error(f"Error closing crawler: {e}", exc_info=True)

    # Database engine connections are usually cleaned up automatically by SQLAlchemy.
    # if engine and hasattr(engine, 'dispose'):  # Needed if using an async engine
    #     await engine.dispose()

    logger.info("Application shutdown complete.")

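
# NOTE: @app.on_event is deprecated in newer FastAPI releases in favor of a
# lifespan handler. A rough sketch of the equivalent (illustrative only; the
# on_event hooks above remain the active implementation):
#
#   from contextlib import asynccontextmanager
#
#   @asynccontextmanager
#   async def lifespan(app: FastAPI):
#       await startup_event()
#       yield
#       await shutdown_event()
#
#   app = FastAPI(title="Browser History Search API", lifespan=lifespan)
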
# --- Include Routers ---
app.include_router(history.router)
app.include_router(bookmarks.router)
app.include_router(api_config.router)
app.include_router(websocket.router)
app.include_router(ui.router)


# Optional: a root endpoint for health checks or basic info
@app.get("/health", tags=["service"])
async def health_check():
    # An extended health check could also verify background task status
    db_ok = False
    try:
        with next(get_db()) as db:
            db.execute(text("SELECT 1"))  # text() is required for raw SQL in SQLAlchemy 2.x
            db_ok = True
    except Exception:
        db_ok = False

    return {
        "status": "ok",
        "database_connection": "ok" if db_ok else "error",
        # Add other checks as needed
    }
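
# Example of exercising the endpoint once the app is running (assumes the
# default uvicorn bind of 127.0.0.1:8000):
#   curl http://127.0.0.1:8000/health
#   -> {"status": "ok", "database_connection": "ok"}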