# Mirror of https://github.com/Zetaphor/browser-recall.git
# Synced 2025-12-06 02:19:37 +00:00 (175 lines, 7.4 KiB, Python)
import asyncio
|
|
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, HTTPException
|
|
from sqlalchemy.orm import Session
|
|
from datetime import datetime, timezone, timedelta
|
|
from urllib.parse import urlparse
|
|
import iso8601
|
|
|
|
# Import necessary components from other modules
|
|
from .. import main as app_main # To access global crawler instance
|
|
from ..database import get_db, HistoryEntry
|
|
from ..config import Config
|
|
from ..logging_config import setup_logger
|
|
|
|
# Module-level wiring shared by every connection handled in this file.
logger = setup_logger(__name__)

# Router collecting the websocket endpoints; included by the main app.
router = APIRouter(tags=["websocket"])

# Local Config instance (assumed safe to instantiate separately here).
config = Config()
|
|
|
|
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
    """Receive page-visit events over a WebSocket and archive their content.

    Each incoming JSON message must contain:
        url:       the visited page URL.
        timestamp: the visit time as an ISO-8601 string.

    For every valid message the handler:
      1. rejects malformed messages / timestamps / URLs,
      2. skips ignored domains and URLs already stored within the last
         5 minutes,
      3. fetches the page with the shared crawl4ai crawler,
      4. stores a ``HistoryEntry`` row (markdown + title), and
      5. replies with a JSON status object whose ``status`` is one of
         ``success`` / ``skipped`` / ``ignored`` / ``error``.

    The loop runs until the client disconnects; unexpected errors close
    the socket with code 1011 (internal server error).
    """
    # The crawler is created during app startup and shared via main.py.
    crawler = app_main.crawler
    if not crawler:
        logger.error("Crawler not initialized!")
        await websocket.close(code=1011)  # Internal Server Error
        return

    logger.info("New WebSocket connection established")
    await websocket.accept()
    try:
        while True:
            data = await websocket.receive_json()

            # Validate incoming data structure (basic check).
            if 'url' not in data or 'timestamp' not in data:
                logger.warning("Received invalid WebSocket message format.")
                await websocket.send_json({
                    "status": "error",
                    "message": "Invalid message format. 'url' and 'timestamp' required."
                })
                continue

            url = data['url']
            try:
                timestamp = iso8601.parse_date(data['timestamp'])
            # TypeError covers non-string payloads (e.g. null/number),
            # which iso8601 raises instead of ParseError; without it one
            # bad message would tear down the whole connection.
            except (iso8601.ParseError, TypeError):
                logger.warning(f"Received invalid timestamp format: {data['timestamp']}")
                await websocket.send_json({
                    "status": "error",
                    "message": f"Invalid timestamp format: {data['timestamp']}"
                })
                continue

            # Parse the URL and check if the domain should be ignored.
            try:
                domain = urlparse(url).netloc
                if not domain:  # Handle invalid URLs (no netloc component).
                    raise ValueError("Could not parse domain from URL")
            except ValueError as e:
                logger.warning(f"Could not parse URL: {url}. Error: {e}")
                await websocket.send_json({"status": "error", "message": f"Invalid URL: {url}"})
                continue

            if config.is_domain_ignored(domain):
                logger.info(f"Ignoring domain: {domain} for URL: {url}")
                await websocket.send_json({
                    "status": "ignored",
                    "message": f"Domain {domain} is in ignore list"
                })
                continue

            logger.info(f"Processing page via WebSocket: {url}")

            # Normalize to an aware UTC timestamp (assume UTC if naive)
            # so DB comparisons below are consistent.
            if timestamp.tzinfo is None:
                timestamp = timestamp.replace(tzinfo=timezone.utc)
            else:
                timestamp = timestamp.astimezone(timezone.utc)

            # Deduplicate: skip if we already stored this URL in the
            # 5 minutes preceding this visit.
            recent_threshold = timestamp - timedelta(minutes=5)
            existing_entry = db.query(HistoryEntry.id).filter(
                HistoryEntry.url == url,
                HistoryEntry.visit_time >= recent_threshold
            ).first()  # Only fetch ID for efficiency

            if existing_entry:
                logger.info(f"Recent entry exists for URL: {url}")
                await websocket.send_json({
                    "status": "skipped",
                    "message": "Recent entry exists"
                })
                continue

            # --- Start crawl4ai processing ---
            logger.info(f"Processing page with crawl4ai: {url}")
            markdown_content = None
            title = ''
            try:
                # Use the global crawler instance.
                crawl_result = await crawler.arun(url=url)
                if crawl_result:
                    markdown_content = crawl_result.markdown
                    # BUG FIX: crawl4ai exposes metadata as a dict, and the
                    # previous getattr(metadata, 'title', '') on a dict never
                    # resolves the key, so the title was silently always ''.
                    # Read it as a dict key, tolerating object-style metadata.
                    metadata = crawl_result.metadata or {}
                    if isinstance(metadata, dict):
                        title = metadata.get('title') or ''  # Ensure title is string
                    else:
                        title = getattr(metadata, 'title', '') or ''
                    if not title:
                        logger.warning(f"Could not extract title for {url} using crawl4ai.")
                    logger.info(f"crawl4ai processing complete. Markdown length: {len(markdown_content) if markdown_content else 0}, Title: '{title}'")
                else:
                    logger.warning(f"crawl4ai returned None for URL: {url}")
                    markdown_content = ""  # Ensure it's not None
                    title = ""

            except Exception as crawl_error:
                logger.error(f"crawl4ai failed for URL {url}: {crawl_error}", exc_info=True)
                await websocket.send_json({
                    "status": "error",
                    "message": f"Failed to crawl page content: {str(crawl_error)}"
                })
                continue  # Skip to next message
            # --- End crawl4ai processing ---

            # Only proceed if we got some content or at least a title.
            if not title and not markdown_content:
                logger.info(f"No title or content extracted by crawl4ai from: {url}")
                await websocket.send_json({
                    "status": "skipped",
                    "message": "No title or content extracted by crawl4ai"
                })
                continue

            # Create history entry using data from crawl4ai.
            history_entry = HistoryEntry(
                url=url,
                title=title,  # Use title from crawl4ai
                visit_time=timestamp,  # Use the parsed, timezone-aware timestamp
                domain=domain,
                markdown_content=markdown_content,  # Use markdown from crawl4ai
                last_content_update=datetime.now(timezone.utc)
            )

            logger.debug(f"Attempting to save entry for {url} with markdown length: {len(markdown_content) if markdown_content else 0}")

            db.add(history_entry)
            try:
                db.commit()
                logger.info(f"Successfully saved entry for: {url}")
                await websocket.send_json({
                    "status": "success",
                    "message": f"Processed page: {url}"
                })
            except Exception as e:
                # Roll back so the session stays usable for later messages.
                db.rollback()
                logger.error(f"Error saving entry for {url}: {e}", exc_info=True)
                await websocket.send_json({
                    "status": "error",
                    "message": "Database error occurred while saving."
                })

    except WebSocketDisconnect:
        logger.info("WebSocket client disconnected")
    except Exception as e:
        logger.error(f"Unhandled error in WebSocket handler: {e}", exc_info=True)
        # Attempt to inform client before closing (might fail if connection
        # is already broken).
        try:
            await websocket.send_json({
                "status": "error",
                "message": "An internal server error occurred."
            })
        except Exception:
            pass  # Ignore if sending fails
        # Ensure connection is closed on server error.
        try:
            await websocket.close(code=1011)  # Internal Server Error
        except Exception:
            pass  # Ignore if closing fails