browser-recall/app/routers/websocket.py

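"""WebSocket router for browser-recall.

Accepts JSON messages of the form {"url": ..., "timestamp": <ISO 8601>},
crawls each page with the shared crawl4ai crawler, and stores the result
as a HistoryEntry. Replies on the same socket with a "status" of
"success", "skipped", "ignored", or "error".
"""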
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends
from sqlalchemy.orm import Session
from datetime import datetime, timezone, timedelta
from urllib.parse import urlparse
import iso8601

# Import necessary components from other modules
from .. import main as app_main  # To access the global crawler instance
from ..database import get_db, HistoryEntry
from ..config import Config
from ..logging_config import setup_logger

logger = setup_logger(__name__)
router = APIRouter(tags=["websocket"])
config = Config()  # Assuming config is fine as a separate instance here

@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
    # Access the global crawler instance from main.py
    crawler = app_main.crawler
    if not crawler:
        logger.error("Crawler not initialized!")
        await websocket.close(code=1011)  # Internal Server Error
        return

    await websocket.accept()
    logger.info("New WebSocket connection established")
    try:
        while True:
            data = await websocket.receive_json()
            # Validate incoming message structure (basic check)
            if 'url' not in data or 'timestamp' not in data:
                logger.warning("Received invalid WebSocket message format.")
                await websocket.send_json({
                    "status": "error",
                    "message": "Invalid message format. 'url' and 'timestamp' required."
                })
                continue

            url = data['url']
            try:
                timestamp = iso8601.parse_date(data['timestamp'])
            except iso8601.ParseError:
                logger.warning(f"Received invalid timestamp format: {data['timestamp']}")
                await websocket.send_json({
                    "status": "error",
                    "message": f"Invalid timestamp format: {data['timestamp']}"
                })
                continue
            # Parse the URL and check whether its domain should be ignored
            try:
                domain = urlparse(url).netloc
                if not domain:  # Handle invalid URLs
                    raise ValueError("Could not parse domain from URL")
            except ValueError as e:
                logger.warning(f"Could not parse URL: {url}. Error: {e}")
                await websocket.send_json({"status": "error", "message": f"Invalid URL: {url}"})
                continue

            if config.is_domain_ignored(domain):
                logger.info(f"Ignoring domain: {domain} for URL: {url}")
                await websocket.send_json({
                    "status": "ignored",
                    "message": f"Domain {domain} is in ignore list"
                })
                continue
logger.info(f"Processing page via WebSocket: {url}")
# Check if we already have a recent entry for this URL
# Make timestamp timezone-aware (assuming UTC if naive)
if timestamp.tzinfo is None:
timestamp = timestamp.replace(tzinfo=timezone.utc)
else:
timestamp = timestamp.astimezone(timezone.utc)
recent_threshold = timestamp - timedelta(minutes=5)
existing_entry = db.query(HistoryEntry.id).filter(
HistoryEntry.url == url,
HistoryEntry.visit_time >= recent_threshold
).first() # Only fetch ID for efficiency
if existing_entry:
logger.info(f"Recent entry exists for URL: {url}")
await websocket.send_json({
"status": "skipped",
"message": "Recent entry exists"
})
continue
            # --- Start crawl4ai processing ---
            logger.info(f"Processing page with crawl4ai: {url}")
            markdown_content = None
            title = ''
            try:
                # Use the global crawler instance
                crawl_result = await crawler.arun(url=url)
                if crawl_result:
                    markdown_content = crawl_result.markdown
                    # crawl_result.metadata is a dict, so getattr() would always
                    # fall through to the default; use .get() and coerce to str
                    title = (crawl_result.metadata or {}).get('title', '') or ''
                    if not title:
                        logger.warning(f"Could not extract title for {url} using crawl4ai.")
                    logger.info(
                        f"crawl4ai processing complete. Markdown length: "
                        f"{len(markdown_content) if markdown_content else 0}, Title: '{title}'"
                    )
                else:
                    logger.warning(f"crawl4ai returned None for URL: {url}")
                    markdown_content = ""  # Ensure it is not None
                    title = ""
            except Exception as crawl_error:
                logger.error(f"crawl4ai failed for URL {url}: {crawl_error}", exc_info=True)
                await websocket.send_json({
                    "status": "error",
                    "message": f"Failed to crawl page content: {str(crawl_error)}"
                })
                continue  # Skip to the next message
            # --- End crawl4ai processing ---
            # Only proceed if we got some content or at least a title
            if not title and not markdown_content:
                logger.info(f"No title or content extracted by crawl4ai from: {url}")
                await websocket.send_json({
                    "status": "skipped",
                    "message": "No title or content extracted by crawl4ai"
                })
                continue

            # Create a history entry using the data from crawl4ai
            history_entry = HistoryEntry(
                url=url,
                title=title,  # Title from crawl4ai
                visit_time=timestamp,  # Parsed, timezone-aware timestamp
                domain=domain,
                markdown_content=markdown_content,  # Markdown from crawl4ai
                last_content_update=datetime.now(timezone.utc)
            )
            logger.debug(
                f"Attempting to save entry for {url} with markdown length: "
                f"{len(markdown_content) if markdown_content else 0}"
            )
            db.add(history_entry)
            try:
                db.commit()
                logger.info(f"Successfully saved entry for: {url}")
                await websocket.send_json({
                    "status": "success",
                    "message": f"Processed page: {url}"
                })
            except Exception as e:
                db.rollback()
                logger.error(f"Error saving entry for {url}: {e}", exc_info=True)
                await websocket.send_json({
                    "status": "error",
                    "message": "Database error occurred while saving."
                })
    except WebSocketDisconnect:
        logger.info("WebSocket client disconnected")
    except Exception as e:
        logger.error(f"Unhandled error in WebSocket handler: {e}", exc_info=True)
        # Try to inform the client before closing (may fail if the connection is already broken)
        try:
            await websocket.send_json({
                "status": "error",
                "message": "An internal server error occurred."
            })
        except Exception:
            pass  # Ignore if sending fails
        # Ensure the connection is closed on server error
        try:
            await websocket.close(code=1011)  # Internal Server Error
        except Exception:
            pass  # Ignore if closing fails
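

# A minimal client sketch for manual testing (assumptions: the app listens on
# ws://localhost:8000/ws and the `websockets` package is installed; neither is
# defined in this module):
#
#   import asyncio, json, websockets
#
#   async def send_visit():
#       async with websockets.connect("ws://localhost:8000/ws") as ws:
#           await ws.send(json.dumps({
#               "url": "https://example.com/article",
#               "timestamp": "2025-04-11T22:41:46-05:00",
#           }))
#           print(json.loads(await ws.recv()))  # e.g. {"status": "success", ...}
#
#   asyncio.run(send_visit())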