browser-recall/app/routers/websocket.py

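"""WebSocket router for browser-recall.

Accepts JSON messages of the form {"url": ..., "timestamp": <ISO 8601>},
crawls each page with the shared crawl4ai crawler, and stores the result
as a HistoryEntry. Replies on the same socket with a "status" of
"success", "skipped", "ignored", or "error".
"""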
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends
from sqlalchemy.orm import Session
from datetime import datetime, timezone, timedelta
from urllib.parse import urlparse
import iso8601

# Import necessary components from other modules
from .. import main as app_main  # To access the global crawler instance
from ..database import get_db, HistoryEntry
from ..config import Config
from ..logging_config import setup_logger

logger = setup_logger(__name__)
router = APIRouter(tags=["websocket"])
config = Config()  # Assuming config is fine as a separate instance here

@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
    # Access the global crawler instance from main.py
    crawler = app_main.crawler
    if not crawler:
        logger.error("Crawler not initialized!")
        await websocket.close(code=1011)  # Internal Server Error
        return

    await websocket.accept()
    logger.info("New WebSocket connection established")
    try:
        while True:
            data = await websocket.receive_json()
            # Validate incoming message structure (basic check)
            if 'url' not in data or 'timestamp' not in data:
                logger.warning("Received invalid WebSocket message format.")
                await websocket.send_json({
                    "status": "error",
                    "message": "Invalid message format. 'url' and 'timestamp' required."
                })
                continue

            url = data['url']
            try:
                timestamp = iso8601.parse_date(data['timestamp'])
            except iso8601.ParseError:
                logger.warning(f"Received invalid timestamp format: {data['timestamp']}")
                await websocket.send_json({
                    "status": "error",
                    "message": f"Invalid timestamp format: {data['timestamp']}"
                })
                continue
            # Parse the URL and check whether its domain should be ignored
            try:
                domain = urlparse(url).netloc
                if not domain:  # Handle invalid URLs
                    raise ValueError("Could not parse domain from URL")
            except ValueError as e:
                logger.warning(f"Could not parse URL: {url}. Error: {e}")
                await websocket.send_json({"status": "error", "message": f"Invalid URL: {url}"})
                continue

            if config.is_domain_ignored(domain):
                logger.info(f"Ignoring domain: {domain} for URL: {url}")
                await websocket.send_json({
                    "status": "ignored",
                    "message": f"Domain {domain} is in ignore list"
                })
                continue
logger.info(f"Processing page via WebSocket: {url}")
# Check if we already have a recent entry for this URL
# Make timestamp timezone-aware (assuming UTC if naive)
if timestamp.tzinfo is None:
timestamp = timestamp.replace(tzinfo=timezone.utc)
else:
timestamp = timestamp.astimezone(timezone.utc)
recent_threshold = timestamp - timedelta(minutes=5)
existing_entry = db.query(HistoryEntry.id).filter(
HistoryEntry.url == url,
HistoryEntry.visit_time >= recent_threshold
).first() # Only fetch ID for efficiency
if existing_entry:
logger.info(f"Recent entry exists for URL: {url}")
await websocket.send_json({
"status": "skipped",
"message": "Recent entry exists"
})
continue
            # --- Start crawl4ai processing ---
            logger.info(f"Processing page with crawl4ai: {url}")
            markdown_content = None
            title = ''
            try:
                # Use the global crawler instance
                crawl_result = await crawler.arun(url=url)
                if crawl_result:
                    markdown_content = crawl_result.markdown
                    # crawl_result.metadata is a dict, so getattr() would always
                    # fall through to the default; use .get() and coerce to str
                    title = (crawl_result.metadata or {}).get('title', '') or ''
                    if not title:
                        logger.warning(f"Could not extract title for {url} using crawl4ai.")
                    logger.info(
                        f"crawl4ai processing complete. Markdown length: "
                        f"{len(markdown_content) if markdown_content else 0}, Title: '{title}'"
                    )
                else:
                    logger.warning(f"crawl4ai returned None for URL: {url}")
                    markdown_content = ""  # Ensure it is not None
                    title = ""
            except Exception as crawl_error:
                logger.error(f"crawl4ai failed for URL {url}: {crawl_error}", exc_info=True)
                await websocket.send_json({
                    "status": "error",
                    "message": f"Failed to crawl page content: {str(crawl_error)}"
                })
                continue  # Skip to the next message
            # --- End crawl4ai processing ---
            # Only proceed if we got some content or at least a title
            if not title and not markdown_content:
                logger.info(f"No title or content extracted by crawl4ai from: {url}")
                await websocket.send_json({
                    "status": "skipped",
                    "message": "No title or content extracted by crawl4ai"
                })
                continue

            # Create a history entry using the data from crawl4ai
            history_entry = HistoryEntry(
                url=url,
                title=title,  # Title from crawl4ai
                visit_time=timestamp,  # Parsed, timezone-aware timestamp
                domain=domain,
                markdown_content=markdown_content,  # Markdown from crawl4ai
                last_content_update=datetime.now(timezone.utc)
            )
            logger.debug(
                f"Attempting to save entry for {url} with markdown length: "
                f"{len(markdown_content) if markdown_content else 0}"
            )
            db.add(history_entry)
            try:
                db.commit()
                logger.info(f"Successfully saved entry for: {url}")
                await websocket.send_json({
                    "status": "success",
                    "message": f"Processed page: {url}"
                })
            except Exception as e:
                db.rollback()
                logger.error(f"Error saving entry for {url}: {e}", exc_info=True)
                await websocket.send_json({
                    "status": "error",
                    "message": "Database error occurred while saving."
                })
    except WebSocketDisconnect:
        logger.info("WebSocket client disconnected")
    except Exception as e:
        logger.error(f"Unhandled error in WebSocket handler: {e}", exc_info=True)
        # Try to inform the client before closing (may fail if the connection is already broken)
        try:
            await websocket.send_json({
                "status": "error",
                "message": "An internal server error occurred."
            })
        except Exception:
            pass  # Ignore if sending fails
        # Ensure the connection is closed on server error
        try:
            await websocket.close(code=1011)  # Internal Server Error
        except Exception:
            pass  # Ignore if closing fails
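

# A minimal client sketch for manual testing (assumptions: the app listens on
# ws://localhost:8000/ws and the `websockets` package is installed; neither is
# defined in this module):
#
#   import asyncio, json, websockets
#
#   async def send_visit():
#       async with websockets.connect("ws://localhost:8000/ws") as ws:
#           await ws.send(json.dumps({
#               "url": "https://example.com/article",
#               "timestamp": "2025-04-11T22:41:46-05:00",
#           }))
#           print(json.loads(await ws.recv()))  # e.g. {"status": "success", ...}
#
#   asyncio.run(send_visit())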