Refactor to use crawl4ai, uv

2025-04-11 22:41:46 -05:00
parent 80516440d7
commit 75a2c51b94
14 changed files with 3559 additions and 648 deletions

app/routers/bookmarks.py

@@ -0,0 +1,47 @@
from fastapi import APIRouter, Depends, Query, HTTPException
from sqlalchemy.orm import Session
from typing import List, Optional

from ..database import get_db, Bookmark
from ..utils import serialize_bookmark
from ..logging_config import setup_logger

logger = setup_logger(__name__)

router = APIRouter(prefix="/bookmarks", tags=["bookmarks"])


@router.get("/search")
async def search_bookmarks(
    domain: Optional[str] = Query(None),
    folder: Optional[str] = Query(None),
    search_term: Optional[str] = Query(None),
    db: Session = Depends(get_db)
):
    """Search bookmarks with optimized queries"""
    try:
        # Build query efficiently
        query = db.query(Bookmark)

        # Apply filters using index-optimized queries
        if domain:
            query = query.filter(Bookmark.domain == domain)
        if folder:
            query = query.filter(Bookmark.folder == folder)
        if search_term:
            # Use ILIKE for title search (consider FTS for bookmarks if needed)
            search_pattern = f"%{search_term}%"
            query = query.filter(Bookmark.title.ilike(search_pattern))
            # Removed index hint as SQLAlchemy/SQLite usually handles this well with LIKE

        # Add ordering and limit for better performance
        bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
        return [serialize_bookmark(bookmark) for bookmark in bookmarks]
    except Exception as e:
        logger.error(f"Bookmark search error: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={"message": "Bookmark search operation failed", "error": str(e)}
        )
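
For reference, the search endpoint can be exercised with a plain HTTP client. The snippet below is only an illustrative sketch: it assumes the app is served locally on port 8000 and that the httpx package is installed; the endpoint path and query parameters come from the router above, but the shape of the serialized bookmarks depends on serialize_bookmark, which is not shown here.

# Illustrative client call for GET /bookmarks/search (base URL is an assumption).
import httpx

resp = httpx.get(
    "http://localhost:8000/bookmarks/search",
    params={"domain": "example.com", "search_term": "python"},
)
resp.raise_for_status()
for bookmark in resp.json():
    print(bookmark)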

app/routers/config.py

@@ -0,0 +1,43 @@
from fastapi import APIRouter, Depends, HTTPException
from typing import List

from ..config import Config
from ..logging_config import setup_logger

logger = setup_logger(__name__)

router = APIRouter(prefix="/config", tags=["config"])

# Assuming config is a singleton or easily accessible
# If not, you might need to use Depends or app state
config = Config()


@router.get("/ignored-domains")
async def get_ignored_domains():
    """Get list of ignored domain patterns"""
    try:
        return {"ignored_domains": config.config.get('ignored_domains', [])}
    except Exception as e:
        logger.error(f"Error getting ignored domains: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Failed to retrieve ignored domains")


@router.post("/ignored-domains")
async def add_ignored_domain(pattern: str):
    """Add a new domain pattern to ignored list"""
    try:
        config.add_ignored_domain(pattern)
        return {"status": "success", "message": f"Added pattern: {pattern}"}
    except Exception as e:
        logger.error(f"Error adding ignored domain '{pattern}': {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Failed to add ignored domain")


@router.delete("/ignored-domains/{pattern}")
async def remove_ignored_domain(pattern: str):
    """Remove a domain pattern from ignored list"""
    try:
        config.remove_ignored_domain(pattern)
        return {"status": "success", "message": f"Removed pattern: {pattern}"}
    except Exception as e:
        logger.error(f"Error removing ignored domain '{pattern}': {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Failed to remove ignored domain")
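
Note that pattern is declared as a bare str parameter rather than a request body model, so FastAPI exposes it as a query parameter on the POST route. A minimal client sketch follows; the base URL, port, and the httpx dependency are assumptions, not part of this commit.

import httpx

base = "http://localhost:8000/config"

print(httpx.get(f"{base}/ignored-domains").json())
print(httpx.post(f"{base}/ignored-domains", params={"pattern": "ads.example.com"}).json())
print(httpx.delete(f"{base}/ignored-domains/ads.example.com").json())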

app/routers/history.py

@@ -0,0 +1,132 @@
from fastapi import APIRouter, Depends, Query, HTTPException
from sqlalchemy.orm import Session
from sqlalchemy import text
from typing import List, Optional

from ..database import get_db, HistoryEntry
from ..utils import serialize_history_entry
from ..logging_config import setup_logger

logger = setup_logger(__name__)

router = APIRouter(prefix="/history", tags=["history"])


@router.get("/search")
async def search_history(
    query: Optional[str] = Query(None),
    domain: Optional[str] = Query(None),
    start_date: Optional[str] = Query(None),
    end_date: Optional[str] = Query(None),
    include_content: bool = Query(False),
    db: Session = Depends(get_db)
):
    """Search history using FTS5"""
    try:
        if query:
            # Build the FTS query.
            # Basic sanitization/escaping is enough for simple term searches;
            # supporting full FTS syntax from user input would need more care.
            params = {}

            # Handling query parts separately (e.g. "term1 title:term2 domain:example.com")
            # would require more sophisticated parsing. For now the whole query
            # is applied to title and content.
            sanitized_query = query.replace('"', '""')  # Basic FTS escaping for quotes
            fts_match_expr = f'(title : "{sanitized_query}"* OR markdown_content : "{sanitized_query}"*)'
            params['fts_query'] = fts_match_expr

            # Domain could be filtered inside the FTS MATCH expression if the
            # domain column were part of the FTS index; for now it is applied
            # as a regular WHERE clause below.

            # Build the SQL query
            sql = """
                SELECT
                    h.*,
                    bm25(history_fts) as rank,
                    highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
                    highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
                FROM history_fts
                JOIN history h ON history_fts.rowid = h.id
                WHERE history_fts MATCH :fts_query
            """

            # Add domain filter as a regular WHERE clause since it is not in the FTS MATCH
            if domain:
                sql += " AND h.domain = :domain"
                params['domain'] = domain

            # Add date filters if provided
            if start_date:
                sql += " AND h.visit_time >= :start_date"
                params['start_date'] = start_date
            if end_date:
                sql += " AND h.visit_time <= :end_date"
                params['end_date'] = end_date

            # bm25() assigns lower scores to better matches, so sort rank ascending
            sql += " ORDER BY rank ASC, h.visit_time DESC LIMIT 100"

            results = db.execute(text(sql), params).fetchall()

            # Use the updated serializer that handles potential highlight/rank fields
            return [serialize_history_entry(row, include_content) for row in results]
        else:
            # Handle non-search queries (basic filtering)
            query_builder = db.query(HistoryEntry)
            if domain:
                query_builder = query_builder.filter(HistoryEntry.domain == domain)
            if start_date:
                query_builder = query_builder.filter(HistoryEntry.visit_time >= start_date)
            if end_date:
                query_builder = query_builder.filter(HistoryEntry.visit_time <= end_date)

            entries = query_builder.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
            return [serialize_history_entry(entry, include_content) for entry in entries]
    except Exception as e:
        logger.error(f"Search error: {str(e)}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={"message": "Search operation failed", "error": str(e)}
        )


@router.get("/search/advanced")
async def advanced_history_search(
    query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
    include_content: bool = Query(False),
    db: Session = Depends(get_db)
):
    """Advanced full-text search using SQLite FTS5 features"""
    try:
        # Use raw SQL for the advanced FTS query, including rank and highlights
        fts_query = """
            SELECT
                h.*,
                bm25(history_fts) as rank,
                highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
                highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
            FROM history_fts
            JOIN history h ON history_fts.rowid = h.id
            WHERE history_fts MATCH :query
            ORDER BY rank ASC, h.visit_time DESC
            LIMIT 1000
        """
        results = db.execute(text(fts_query), {'query': query}).fetchall()

        # Use the updated serializer
        return [serialize_history_entry(row, include_content) for row in results]
    except Exception as e:
        logger.error(f"Advanced search error: {e}", exc_info=True)
        raise HTTPException(
            status_code=500,
            detail={"message": "Advanced search operation failed", "error": str(e)}
        )
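
These queries assume an FTS5 index named history_fts whose first two columns are title and markdown_content (matching the highlight() column indexes 0 and 1) and whose rowid lines up with history.id. The actual schema is defined elsewhere in the app and is not shown in this commit; the following is only a sketch of what such an external-content FTS5 table could look like, with a hypothetical database path.

import sqlite3

conn = sqlite3.connect("history.db")  # hypothetical path; the app defines its own
conn.executescript("""
CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5(
    title,
    markdown_content,
    content='history',       -- external-content table backed by history
    content_rowid='id'
);
-- Keep the index in sync when new history rows are inserted
CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN
    INSERT INTO history_fts(rowid, title, markdown_content)
    VALUES (new.id, new.title, new.markdown_content);
END;
""")
conn.close()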

app/routers/ui.py

@@ -0,0 +1,52 @@
from fastapi import APIRouter, Depends, Request
from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session

from ..database import get_db, HistoryEntry, Bookmark
from ..logging_config import setup_logger

logger = setup_logger(__name__)

router = APIRouter(tags=["ui"])
templates = Jinja2Templates(directory="app/templates")


@router.get("/")
async def home(request: Request, db: Session = Depends(get_db)):
    try:
        # Get recent history entries
        entries = db.query(HistoryEntry)\
            .order_by(HistoryEntry.visit_time.desc())\
            .limit(50)\
            .all()
        return templates.TemplateResponse(
            "index.html",
            {"request": request, "entries": entries}
        )
    except Exception as e:
        logger.error(f"Error loading home page: {e}", exc_info=True)
        # Optionally return an error template
        return templates.TemplateResponse("error.html", {"request": request, "detail": "Could not load history"})


@router.get("/search")
async def search_page(request: Request):
    return templates.TemplateResponse(
        "search.html",
        {"request": request}
    )


@router.get("/bookmarks")
async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
    try:
        bookmarks = db.query(Bookmark)\
            .order_by(Bookmark.added_time.desc())\
            .limit(50)\
            .all()
        return templates.TemplateResponse(
            "bookmarks.html",
            {"request": request, "bookmarks": bookmarks}
        )
    except Exception as e:
        logger.error(f"Error loading bookmarks page: {e}", exc_info=True)
        # Optionally return an error template
        return templates.TemplateResponse("error.html", {"request": request, "detail": "Could not load bookmarks"})

app/routers/websocket.py

@@ -0,0 +1,175 @@
import asyncio
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, HTTPException
from sqlalchemy.orm import Session
from datetime import datetime, timezone, timedelta
from urllib.parse import urlparse
import iso8601

# Import necessary components from other modules
from .. import main as app_main  # To access the global crawler instance
from ..database import get_db, HistoryEntry
from ..config import Config
from ..logging_config import setup_logger

logger = setup_logger(__name__)

router = APIRouter(tags=["websocket"])
config = Config()  # Assuming config is okay as a separate instance here


@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
    # Access the global crawler instance from main.py
    crawler = app_main.crawler
    if not crawler:
        logger.error("Crawler not initialized!")
        await websocket.close(code=1011)  # Internal Server Error
        return

    logger.info("New WebSocket connection established")
    await websocket.accept()
    try:
        while True:
            data = await websocket.receive_json()

            # Validate incoming data structure (basic check)
            if 'url' not in data or 'timestamp' not in data:
                logger.warning("Received invalid WebSocket message format.")
                await websocket.send_json({
                    "status": "error",
                    "message": "Invalid message format. 'url' and 'timestamp' required."
                })
                continue

            url = data['url']
            try:
                timestamp = iso8601.parse_date(data['timestamp'])
            except iso8601.ParseError:
                logger.warning(f"Received invalid timestamp format: {data['timestamp']}")
                await websocket.send_json({
                    "status": "error",
                    "message": f"Invalid timestamp format: {data['timestamp']}"
                })
                continue

            # Parse the URL and check whether the domain should be ignored
            try:
                domain = urlparse(url).netloc
                if not domain:  # Handle invalid URLs
                    raise ValueError("Could not parse domain from URL")
            except ValueError as e:
                logger.warning(f"Could not parse URL: {url}. Error: {e}")
                await websocket.send_json({"status": "error", "message": f"Invalid URL: {url}"})
                continue

            if config.is_domain_ignored(domain):
                logger.info(f"Ignoring domain: {domain} for URL: {url}")
                await websocket.send_json({
                    "status": "ignored",
                    "message": f"Domain {domain} is in ignore list"
                })
                continue

            logger.info(f"Processing page via WebSocket: {url}")

            # Check if we already have a recent entry for this URL.
            # Make the timestamp timezone-aware (assuming UTC if naive).
            if timestamp.tzinfo is None:
                timestamp = timestamp.replace(tzinfo=timezone.utc)
            else:
                timestamp = timestamp.astimezone(timezone.utc)

            recent_threshold = timestamp - timedelta(minutes=5)
            existing_entry = db.query(HistoryEntry.id).filter(
                HistoryEntry.url == url,
                HistoryEntry.visit_time >= recent_threshold
            ).first()  # Only fetch the ID for efficiency

            if existing_entry:
                logger.info(f"Recent entry exists for URL: {url}")
                await websocket.send_json({
                    "status": "skipped",
                    "message": "Recent entry exists"
                })
                continue

            # --- Start crawl4ai processing ---
            logger.info(f"Processing page with crawl4ai: {url}")
            markdown_content = None
            title = ''
            try:
                # Use the global crawler instance
                crawl_result = await crawler.arun(url=url)
                if crawl_result:
                    markdown_content = crawl_result.markdown
                    # crawl4ai's CrawlResult.metadata is typically a dict; fall back
                    # gracefully so the title lookup works either way
                    meta = crawl_result.metadata or {}
                    if isinstance(meta, dict):
                        title = meta.get('title', '') or ''
                    else:
                        title = getattr(meta, 'title', '') or ''
                    if not title:
                        logger.warning(f"Could not extract title for {url} using crawl4ai.")
                    logger.info(f"crawl4ai processing complete. Markdown length: {len(markdown_content) if markdown_content else 0}, Title: '{title}'")
                else:
                    logger.warning(f"crawl4ai returned None for URL: {url}")
                    markdown_content = ""  # Ensure it's not None
                    title = ""
            except Exception as crawl_error:
                logger.error(f"crawl4ai failed for URL {url}: {crawl_error}", exc_info=True)
                await websocket.send_json({
                    "status": "error",
                    "message": f"Failed to crawl page content: {str(crawl_error)}"
                })
                continue  # Skip to next message
            # --- End crawl4ai processing ---

            # Only proceed if we got some content or at least a title
            if not title and not markdown_content:
                logger.info(f"No title or content extracted by crawl4ai from: {url}")
                await websocket.send_json({
                    "status": "skipped",
                    "message": "No title or content extracted by crawl4ai"
                })
                continue

            # Create history entry using data from crawl4ai
            history_entry = HistoryEntry(
                url=url,
                title=title,  # Use title from crawl4ai
                visit_time=timestamp,  # Use the parsed, timezone-aware timestamp
                domain=domain,
                markdown_content=markdown_content,  # Use markdown from crawl4ai
                last_content_update=datetime.now(timezone.utc)
            )
            logger.debug(f"Attempting to save entry for {url} with markdown length: {len(markdown_content) if markdown_content else 0}")
            db.add(history_entry)
            try:
                db.commit()
                logger.info(f"Successfully saved entry for: {url}")
                await websocket.send_json({
                    "status": "success",
                    "message": f"Processed page: {url}"
                })
            except Exception as e:
                db.rollback()
                logger.error(f"Error saving entry for {url}: {e}", exc_info=True)
                await websocket.send_json({
                    "status": "error",
                    "message": "Database error occurred while saving."
                })
    except WebSocketDisconnect:
        logger.info("WebSocket client disconnected")
    except Exception as e:
        logger.error(f"Unhandled error in WebSocket handler: {e}", exc_info=True)
        # Attempt to inform the client before closing (may fail if the connection is already broken)
        try:
            await websocket.send_json({
                "status": "error",
                "message": "An internal server error occurred."
            })
        except Exception:
            pass  # Ignore if sending fails
        # Ensure the connection is closed on server error
        try:
            await websocket.close(code=1011)  # Internal Server Error
        except Exception:
            pass  # Ignore if closing fails
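
Clients are expected to send JSON messages of the form {"url": "...", "timestamp": "<ISO-8601>"} over /ws. The handler reads a module-level crawler from app/main.py (app_main.crawler), which is not part of this section. Below is a minimal sketch of how that global might be initialized with crawl4ai's AsyncWebCrawler via a FastAPI lifespan hook; the variable name mirrors the reference above, but everything else is an assumption about main.py rather than the commit's actual code.

from contextlib import asynccontextmanager

from fastapi import FastAPI
from crawl4ai import AsyncWebCrawler

crawler = None  # read by app/routers/websocket.py as app_main.crawler


@asynccontextmanager
async def lifespan(app: FastAPI):
    global crawler
    # AsyncWebCrawler is an async context manager: the browser session starts
    # on enter and is cleaned up on exit when the app shuts down.
    async with AsyncWebCrawler() as c:
        crawler = c
        yield
    crawler = None


app = FastAPI(lifespan=lifespan)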