Refactor to use crawl4ai, uv

2025-04-11 22:41:46 -05:00
parent 80516440d7
commit 75a2c51b94
14 changed files with 3559 additions and 648 deletions

View File

@@ -1 +1 @@
-3.10.6
+3.10.16

View File

@@ -2,6 +2,10 @@ import yaml
from pathlib import Path
from typing import Set
import fnmatch
import os
import logging
logger = logging.getLogger(__name__)
class Config:
def __init__(self):
@@ -128,3 +132,120 @@ class ReaderConfig:
return False
return True
DEFAULT_CONFIG_PATH = 'config/reader_config.yaml'
USER_CONFIG_DIR = os.path.expanduser("~/.config/browser-recall")
USER_CONFIG_PATH = os.path.join(USER_CONFIG_DIR, 'reader_config.yaml')
class Config:
_instance = None
def __new__(cls, *args, **kwargs):
if not cls._instance:
cls._instance = super(Config, cls).__new__(cls)
cls._instance._initialized = False
return cls._instance
def __init__(self, config_path=None):
if self._initialized:
return
self._initialized = True
self.config_path = self._determine_config_path(config_path)
self.config_data = self._load_config()
logger.info(f"Config initialized using: {self.config_path}")
# Pre-process excluded domains for faster lookup if needed,
# but direct iteration with fnmatch is often fine for moderate lists.
self.excluded_domains = self.config_data.get('excluded_domains', [])
# Ensure it's a list
if not isinstance(self.excluded_domains, list):
logger.warning(f"Excluded domains in config is not a list: {self.excluded_domains}. Ignoring.")
self.excluded_domains = []
def _determine_config_path(self, provided_path):
"""Determine the correct config path to use."""
if provided_path and os.path.exists(provided_path):
return provided_path
if os.path.exists(USER_CONFIG_PATH):
return USER_CONFIG_PATH
if os.path.exists(DEFAULT_CONFIG_PATH):
return DEFAULT_CONFIG_PATH
logger.warning("No configuration file found at default or user locations. Using empty config.")
return None # Indicate no file was found
def _load_config(self):
"""Loads the YAML configuration file."""
if not self.config_path:
return {} # Return empty dict if no config file path determined
try:
with open(self.config_path, 'r') as f:
return yaml.safe_load(f) or {} # Return empty dict if file is empty
except FileNotFoundError:
logger.warning(f"Configuration file not found at {self.config_path}. Using default settings.")
return {}
except yaml.YAMLError as e:
logger.error(f"Error parsing configuration file {self.config_path}: {e}")
return {} # Return empty dict on parsing error
except Exception as e:
logger.error(f"Unexpected error loading configuration {self.config_path}: {e}")
return {}
def get_config(self):
"""Returns the loaded configuration data."""
return self.config_data
def reload_config(self):
"""Reloads the configuration from the file."""
logger.info(f"Reloading configuration from: {self.config_path}")
self.config_data = self._load_config()
self.excluded_domains = self.config_data.get('excluded_domains', [])
if not isinstance(self.excluded_domains, list):
logger.warning(f"Excluded domains in reloaded config is not a list: {self.excluded_domains}. Ignoring.")
self.excluded_domains = []
logger.info("Configuration reloaded.")
def is_domain_ignored(self, domain: str) -> bool:
"""
Checks if a given domain matches any pattern in the excluded_domains list.
Supports exact matches and wildcard (*) matching using fnmatch.
"""
if not domain: # Ignore empty domains
return True
if not self.excluded_domains: # If list is empty, nothing is ignored
return False
# Normalize domain to lowercase for case-insensitive comparison
domain_lower = domain.lower()
for pattern in self.excluded_domains:
if not isinstance(pattern, str): # Skip non-string patterns
continue
# Normalize pattern to lowercase
pattern_lower = pattern.lower()
# Use fnmatch.fnmatch for wildcard support (*)
if fnmatch.fnmatch(domain_lower, pattern_lower):
# logger.debug(f"Domain '{domain}' ignored due to pattern '{pattern}'")
return True
return False
# --- Add methods to get specific config values safely ---
@property
def history_update_interval_seconds(self) -> int:
"""Gets the history update interval, defaulting to 300."""
return self.config_data.get('history_update_interval_seconds', 300)
@property
def markdown_update_interval_seconds(self) -> int:
"""Gets the markdown update interval, defaulting to 300."""
return self.config_data.get('markdown_update_interval_seconds', 300)
# Add other specific getters as needed
# Example:
# @property
# def some_other_setting(self) -> str:
# return self.config_data.get('some_other_setting', 'default_value')
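For reference, a minimal standalone sketch of the wildcard matching that is_domain_ignored performs above. The patterns and domains here are made-up examples; in the app the list comes from excluded_domains in reader_config.yaml.
import fnmatch

# Hypothetical patterns; the real ones come from the YAML config's excluded_domains list.
excluded_domains = ["*.google.com", "localhost", "*.internal.*"]

def is_ignored(domain: str) -> bool:
    # Case-insensitive comparison, mirroring the normalization in Config.is_domain_ignored.
    domain = domain.lower()
    return any(fnmatch.fnmatch(domain, pattern.lower()) for pattern in excluded_domains)

print(is_ignored("mail.google.com"))  # True: matches "*.google.com"
print(is_ignored("example.org"))      # False: no pattern matches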

View File

@@ -1,493 +1,293 @@
-from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException
+from fastapi import FastAPI, Depends
from sqlalchemy.orm import Session
-from datetime import datetime, timezone, timedelta
+from datetime import datetime, timezone
-from typing import List, Optional
+from typing import Optional
import asyncio
-from fastapi import WebSocketDisconnect
from urllib.parse import urlparse
-import pytz
from fastapi.middleware.cors import CORSMiddleware
-import iso8601
-from bs4 import BeautifulSoup
-from sqlalchemy import text
-from sqlalchemy.sql import text
-from .logging_config import setup_logger
-from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
-from fastapi import Request
import browser_history
+from crawl4ai import AsyncWebCrawler
+# Local imports
+from .logging_config import setup_logger
from .database import (
get_db,
HistoryEntry,
-Bookmark,
get_last_processed_timestamp,
update_last_processed_timestamp,
create_tables,
engine,
-recreate_fts_tables
+# recreate_fts_tables # Keep if needed, but often done manually or via migration tool
)
-from .scheduler import HistoryScheduler
-from .page_info import PageInfo
-from .page_reader import PageReader
from .config import Config
-from sqlalchemy.ext.declarative import declarative_base
+# Import Routers
+from .routers import history, bookmarks, config as api_config, websocket, ui
logger = setup_logger(__name__)
-app = FastAPI()
-scheduler = HistoryScheduler()
-config = Config()
+# --- Global Variables ---
+# These are accessed by other modules (like websocket router)
+# Consider using app state or dependency injection for cleaner management if complexity grows
+config_manager = Config() # Renamed to avoid conflict with router import
+crawler: Optional[AsyncWebCrawler] = None
-# Add CORS middleware to allow WebSocket connections
+# Import scheduler *after* crawler is defined
+from .scheduler import HistoryScheduler
+scheduler: Optional[HistoryScheduler] = None # Now initialize scheduler variable
+# --- FastAPI App Initialization ---
+app = FastAPI(title="Browser History Search API")
+# Add CORS middleware
app.add_middleware(
CORSMiddleware,
-allow_origins=["*"], # In production, specify your domains
+allow_origins=["*"], # Adjust in production
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
-templates = Jinja2Templates(directory="app/templates")
+# Mount static files and templates
app.mount("/static", StaticFiles(directory="app/static"), name="static")
+# Note: Templates are used within the ui router now, no need for global instance here unless used elsewhere
-@app.on_event("startup")
+# --- Helper Function (Initial Sync) ---
async def startup_event():
logger.info("Starting application")
try:
# First create the base tables
logger.info("Creating base tables...")
create_tables()
# # Drop and recreate FTS tables
# logger.info("Recreating FTS tables...")
# with engine.connect() as conn:
# # First check if the main history table exists
# result = conn.execute(text(
# "SELECT name FROM sqlite_master WHERE type='table' AND name='history'"
# )).fetchone()
# if not result:
# logger.info("Main history table doesn't exist yet, creating tables...")
# Base.metadata.create_all(bind=engine)
# # Now recreate FTS tables
# logger.info("Dropping and recreating FTS tables...")
# recreate_fts_tables()
# logger.info("FTS tables recreation completed")
# Initial history and bookmark fetch
logger.info("Processing initial browser history...")
process_browser_history()
logger.info("Updating bookmarks...")
await scheduler.update_bookmarks()
# Start the background tasks
logger.info("Starting background tasks...")
asyncio.create_task(scheduler.update_history())
logger.info("Startup completed successfully")
except Exception as e:
logger.error(f"Error during startup: {str(e)}", exc_info=True)
raise
def serialize_history_entry(entry, include_content: bool = False):
"""Serialize a HistoryEntry object to a dictionary"""
# Handle both ORM objects and raw SQL results
if hasattr(entry, '_mapping'): # Raw SQL result
result = {
"id": entry.id,
"url": entry.url,
"title": entry.title,
"visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
"domain": entry.domain,
}
else: # ORM object
result = {
"id": entry.id,
"url": entry.url,
"title": entry.title,
"visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
"domain": entry.domain,
}
if include_content:
result["markdown_content"] = entry.markdown_content
return result
def serialize_bookmark(bookmark):
"""Serialize a Bookmark object to a dictionary"""
return {
"id": bookmark.id,
"url": bookmark.url,
"title": bookmark.title,
"added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
"folder": bookmark.folder,
"domain": bookmark.domain,
}
@app.get("/history/search")
async def search_history(
query: Optional[str] = Query(None),
domain: Optional[str] = Query(None),
start_date: Optional[str] = Query(None),
end_date: Optional[str] = Query(None),
include_content: bool = Query(False),
db: Session = Depends(get_db)
):
"""Search history using FTS5"""
try:
if query:
# Build the FTS query
fts_conditions = [f'title:{query}* OR markdown_content:{query}*']
params = {'query': query}
if domain:
fts_conditions.append(f'domain:"{domain}"')
fts_query = ' AND '.join(fts_conditions)
# Build the SQL query
sql = """
SELECT
h.*,
bm25(history_fts) as rank,
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
FROM history_fts
JOIN history h ON history_fts.rowid = h.id
WHERE history_fts MATCH :fts_query
"""
# Add date filters if provided
if start_date:
sql += " AND h.visit_time >= :start_date"
params['start_date'] = start_date
if end_date:
sql += " AND h.visit_time <= :end_date"
params['end_date'] = end_date
sql += " ORDER BY rank, h.visit_time DESC LIMIT 100"
params['fts_query'] = fts_query
results = db.execute(text(sql), params).fetchall()
return [serialize_history_entry(row, include_content) for row in results]
else:
# Handle non-search queries
query = db.query(HistoryEntry)
if domain:
query = query.filter(HistoryEntry.domain == domain)
if start_date:
query = query.filter(HistoryEntry.visit_time >= start_date)
if end_date:
query = query.filter(HistoryEntry.visit_time <= end_date)
entries = query.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
return [serialize_history_entry(entry, include_content) for entry in entries]
except Exception as e:
logger.error(f"Search error: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail={"message": "Search operation failed", "error": str(e)}
)
@app.get("/bookmarks/search")
async def search_bookmarks(
domain: Optional[str] = Query(None),
folder: Optional[str] = Query(None),
search_term: Optional[str] = Query(None),
db: Session = Depends(get_db)
):
"""Search bookmarks with optimized queries"""
try:
# Build query efficiently
query = db.query(Bookmark)
# Apply filters using index-optimized queries
if domain:
query = query.filter(Bookmark.domain == domain)
if folder:
query = query.filter(Bookmark.folder == folder)
if search_term:
# Use LIKE with index hint for title search
search_pattern = f"%{search_term}%"
query = query.filter(
Bookmark.title.ilike(search_pattern)
).with_hint(
Bookmark,
'INDEXED BY ix_bookmarks_title',
'sqlite'
)
# Add ordering and limit for better performance
bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
return [serialize_bookmark(bookmark) for bookmark in bookmarks]
except Exception as e:
print(f"Bookmark search error: {e}")
raise HTTPException(status_code=500, detail="Search operation failed")
# Add new endpoint for advanced full-text search
@app.get("/history/search/advanced")
async def advanced_history_search(
query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
include_content: bool = Query(False),
db: Session = Depends(get_db)
):
"""Advanced full-text search using SQLite FTS5 features"""
try:
# Use raw SQL for advanced FTS query
fts_query = """
SELECT h.*, rank
FROM history h
INNER JOIN history_fts f ON h.id = f.rowid
WHERE history_fts MATCH :query
ORDER BY rank
LIMIT 1000
"""
results = db.execute(text(fts_query), {'query': query}).all()
# Convert results to HistoryEntry objects
entries = [
serialize_history_entry(
HistoryEntry(
id=row.id,
url=row.url,
title=row.title,
visit_time=row.visit_time,
domain=row.domain,
markdown_content=row.markdown_content if include_content else None
),
include_content
)
for row in results
]
return entries
except Exception as e:
print(f"Advanced search error: {e}")
raise HTTPException(status_code=500, detail="Advanced search operation failed")
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
logger.info("New WebSocket connection established")
page_reader = PageReader()
await websocket.accept()
try:
while True:
data = await websocket.receive_json()
# Parse the URL and check if domain should be ignored
domain = urlparse(data['url']).netloc
if config.is_domain_ignored(domain):
logger.info(f"Ignoring domain: {domain}")
await websocket.send_json({
"status": "ignored",
"message": f"Domain {domain} is in ignore list"
})
continue
logger.info(f"Processing page: {data['url']}")
timestamp = iso8601.parse_date(data['timestamp'])
# Check if we already have a recent entry for this URL
existing_entry = db.query(HistoryEntry).filter(
HistoryEntry.url == data['url'],
HistoryEntry.visit_time >= timestamp - timedelta(minutes=5)
).first()
if existing_entry:
print(f"Recent entry exists for URL: {data['url']}")
await websocket.send_json({
"status": "skipped",
"message": "Recent entry exists"
})
continue
page_info = PageInfo(
url=data['url'],
html=data['html'],
timestamp=timestamp
)
# Debug HTML content
print(f"HTML content length before processing: {len(page_info.html)}")
# Extract title
soup = BeautifulSoup(page_info.html, 'html.parser')
title = soup.title.string if soup.title else ''
print(f"Extracted title: {title}")
# Debug markdown conversion
print("Starting markdown conversion...")
cleaned_html = page_reader.clean_html(page_info.html)
print(f"Cleaned HTML length: {len(cleaned_html)}")
markdown_content = page_reader.html_to_markdown(page_info.html)
print(f"Markdown conversion complete. Content length: {len(markdown_content) if markdown_content else 0}")
if markdown_content:
print("First 100 chars of markdown:", markdown_content[:100])
else:
print("No markdown content generated")
if not title and not markdown_content:
print(f"No content extracted from: {page_info.url}")
await websocket.send_json({
"status": "skipped",
"message": "No content extracted"
})
continue
# Create history entry
history_entry = HistoryEntry(
url=page_info.url,
title=title,
visit_time=page_info.timestamp,
domain=domain,
markdown_content=markdown_content,
last_content_update=datetime.now(timezone.utc)
)
# Debug database operation
print(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}")
# Use bulk operations for better performance
db.add(history_entry)
try:
db.commit()
print(f"Successfully saved entry for: {page_info.url}")
print(f"Verify markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}")
await websocket.send_json({
"status": "success",
"message": f"Processed page: {page_info.url}"
})
except Exception as e:
db.rollback()
print(f"Error saving entry: {e}")
await websocket.send_json({
"status": "error",
"message": "Database error"
})
except WebSocketDisconnect:
logger.info("Client disconnected")
except Exception as e:
logger.error("Error in WebSocket handler", exc_info=True)
finally:
await page_reader.close()
@app.get("/config/ignored-domains")
async def get_ignored_domains():
"""Get list of ignored domain patterns"""
return {"ignored_domains": config.config.get('ignored_domains', [])}
@app.post("/config/ignored-domains")
async def add_ignored_domain(pattern: str):
"""Add a new domain pattern to ignored list"""
config.add_ignored_domain(pattern)
return {"status": "success", "message": f"Added pattern: {pattern}"}
@app.delete("/config/ignored-domains/{pattern}")
async def remove_ignored_domain(pattern: str):
"""Remove a domain pattern from ignored list"""
config.remove_ignored_domain(pattern)
return {"status": "success", "message": f"Removed pattern: {pattern}"}
@app.get("/")
async def home(request: Request, db: Session = Depends(get_db)):
# Get recent history entries
entries = db.query(HistoryEntry)\
.order_by(HistoryEntry.visit_time.desc())\
.limit(50)\
.all()
return templates.TemplateResponse(
"index.html",
{"request": request, "entries": entries}
)
@app.get("/search")
async def search_page(request: Request):
return templates.TemplateResponse(
"search.html",
{"request": request}
)
@app.get("/bookmarks")
async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
bookmarks = db.query(Bookmark)\
.order_by(Bookmark.added_time.desc())\
.limit(50)\
.all()
return templates.TemplateResponse(
"bookmarks.html",
{"request": request, "bookmarks": bookmarks}
)
def process_browser_history(): def process_browser_history():
"""Fetches and stores new history entries from browser_history library (Initial Sync)."""
try: try:
logger.info("Starting browser history processing") logger.info("Starting browser history processing (initial sync)")
outputs = browser_history.get_history() outputs = browser_history.get_history()
history_list = outputs.histories # This is a list of tuples (timestamp, url, title) # browser_history returns platform specific History object, get histories list
logger.info(f"Found {len(history_list)} total history items") history_list = []
if hasattr(outputs, 'histories') and outputs.histories:
history_list = outputs.histories # List of (datetime, url, title)
else:
logger.warning("Could not retrieve histories list from browser_history output.")
return # Exit if no history list found
current_timestamp = int(datetime.now().timestamp()) logger.info(f"Found {len(history_list)} total history items from browser_history library")
source_key = "browser_history" # Single source since we get combined history
last_timestamp = get_last_processed_timestamp(source_key)
logger.info(f"Last processed timestamp: {last_timestamp}") current_timestamp_dt = datetime.now(timezone.utc)
current_timestamp = int(current_timestamp_dt.timestamp()) # Use timezone-aware timestamp
source_key = "browser_history_sync" # Differentiate from scheduler source
last_timestamp = get_last_processed_timestamp(source_key) or 0 # Ensure it's 0 if None
# Filter for only new entries logger.info(f"Last processed timestamp for initial sync '{source_key}': {last_timestamp}")
new_entries = [
entry for entry in history_list
if entry[0].timestamp() > last_timestamp
]
logger.info(f"Found {len(new_entries)} new entries") new_entries = []
processed_urls_times = set() # Avoid duplicates within the batch
for entry in history_list:
# Basic validation of entry structure
if not isinstance(entry, (tuple, list)) or len(entry) < 2:
logger.warning(f"Skipping malformed history entry: {entry}")
continue
timestamp, url = entry[0], entry[1]
title = entry[2] if len(entry) > 2 else "" # Handle optional title
if not url or not timestamp:
logger.warning(f"Skipping entry with missing URL or timestamp: Title='{title}'")
continue
# Ensure timestamp is datetime object
if not isinstance(timestamp, datetime):
logger.warning(f"Skipping entry with non-datetime timestamp ({type(timestamp)}): {url}")
continue
# Normalize timestamp (Assume local if naive, convert to UTC)
if timestamp.tzinfo is None or timestamp.tzinfo.utcoffset(timestamp) is None:
try:
timestamp_aware = timestamp.astimezone() # Make aware using system local
except Exception as tz_err:
logger.warning(f"Could not determine local timezone for naive timestamp {timestamp}. Assuming UTC. Error: {tz_err}")
timestamp_aware = timestamp.replace(tzinfo=timezone.utc) # Fallback to UTC
else:
timestamp_aware = timestamp
timestamp_utc = timestamp_aware.astimezone(timezone.utc)
# Filter for only new entries based on normalized UTC timestamp
if timestamp_utc.timestamp() > last_timestamp:
entry_key = (url, timestamp_utc.timestamp())
if entry_key in processed_urls_times:
continue # Skip duplicate within this batch
new_entries.append((timestamp_utc, url, title))
processed_urls_times.add(entry_key)
logger.info(f"Found {len(new_entries)} new entries for initial sync after filtering")
if new_entries: if new_entries:
for timestamp, url, title in new_entries: added_count = 0
logger.info(f"Processing entry: {timestamp} - {url}") skipped_ignored = 0
domain = urlparse(url).netloc # Use context manager for session
if config.is_domain_ignored(domain): with next(get_db()) as db:
logger.debug(f"Skipping ignored domain: {domain}")
continue
# Create history entry
db = next(get_db())
try: try:
history_entry = HistoryEntry( for timestamp_utc, url, title in new_entries:
url=url, domain = urlparse(url).netloc
title=title, if config_manager.is_domain_ignored(domain):
visit_time=timestamp, # logger.debug(f"Skipping ignored domain during initial sync: {domain}")
domain=domain skipped_ignored += 1
) continue
db.add(history_entry)
db.commit() # Optional: Check if entry already exists more robustly
# existing = db.query(HistoryEntry.id).filter(HistoryEntry.url == url, HistoryEntry.visit_time == timestamp_utc).first()
# if existing:
# continue
history_entry = HistoryEntry(
url=url,
title=title or "", # Ensure title is not None
visit_time=timestamp_utc,
domain=domain
# Note: No markdown content here, only basic history
)
db.add(history_entry)
added_count += 1
if added_count > 0:
db.commit()
logger.info(f"Committed {added_count} new history entries from initial sync.")
# Update the last processed timestamp only if successful commit
update_last_processed_timestamp(source_key, current_timestamp)
logger.info(f"Updated initial sync timestamp for '{source_key}' to {current_timestamp}")
else:
logger.info("No new unique entries to commit during initial sync.")
# Update timestamp even if nothing new added, to mark sync time
update_last_processed_timestamp(source_key, current_timestamp)
logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
if skipped_ignored > 0:
logger.info(f"Skipped {skipped_ignored} entries due to ignored domains during initial sync.")
except Exception as e: except Exception as e:
logger.error(f"Error storing history item: {str(e)}") logger.error(f"Error storing history item during initial sync: {str(e)}", exc_info=True)
db.rollback() db.rollback()
finally: else:
db.close() logger.info("No new history entries found during initial sync.")
# Update timestamp even if nothing new found, to mark sync time
update_last_processed_timestamp(source_key, current_timestamp)
logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
# Update the last processed timestamp
update_last_processed_timestamp(source_key, current_timestamp)
logger.info(f"Updated timestamp to {current_timestamp}")
logger.info(f"Processed {len(new_entries)} new items") except ImportError:
logger.warning("`browser_history` library not found or import failed. Skipping initial sync.")
except Exception as e:
logger.error(f"Error processing browser history during initial sync: {str(e)}", exc_info=True)
# --- Startup and Shutdown Events ---
@app.on_event("startup")
async def startup_event():
global crawler, scheduler # Allow modification of globals
logger.info("Starting application initialization...")
try:
# 1. Ensure base tables exist
logger.info("Ensuring base tables exist...")
create_tables()
# 2. Initialize the crawler
logger.info("Initializing AsyncWebCrawler...")
if crawler is None:
crawler = AsyncWebCrawler()
logger.info("AsyncWebCrawler initialized.")
# 3. Initialize the Scheduler *after* the crawler
logger.info("Initializing HistoryScheduler...")
if scheduler is None:
scheduler = HistoryScheduler(crawler=crawler) # Pass crawler instance
logger.info("HistoryScheduler initialized.")
# 4. Perform initial history sync from browser_history library
logger.info("Performing initial browser history sync...")
process_browser_history() # Sync history not processed before
# 5. Perform initial bookmark sync (using scheduler's method)
# Run in background to avoid blocking startup if it takes long
logger.info("Starting initial bookmark sync task...")
asyncio.create_task(scheduler.update_bookmarks())
# 6. Start background tasks (scheduler for ongoing updates)
logger.info("Starting background history update task...")
asyncio.create_task(scheduler.update_history())
# --- Markdown Update Tasks ---
# 7a. Trigger ONE initial batch processing run in the background
logger.info("Starting initial markdown processing batch task...")
asyncio.create_task(scheduler._process_markdown_batch()) # Run one batch now
# 7b. Start the PERIODIC background markdown update task
logger.info("Starting periodic background markdown update task...")
# Use the renamed method for the loop
asyncio.create_task(scheduler.update_missing_markdown_periodically())
# --- End Markdown Update Tasks ---
logger.info("Application startup sequence initiated. Background tasks running.")
except Exception as e: except Exception as e:
logger.error(f"Error processing browser history: {str(e)}", exc_info=True) logger.error(f"FATAL ERROR during application startup: {str(e)}", exc_info=True)
raise RuntimeError(f"Application startup failed: {e}") from e
@app.on_event("shutdown")
async def shutdown_event():
global crawler, scheduler
logger.info("Starting application shutdown...")
# Stop scheduler tasks gracefully if possible (implement cancellation in tasks if needed)
# For now, we just close resources
# Close scheduler resources
if scheduler and hasattr(scheduler, 'close'):
try:
logger.info("Closing scheduler resources...")
await scheduler.close() # Call the scheduler's close method
except Exception as e:
logger.error(f"Error closing scheduler: {e}", exc_info=True)
# Close crawler if needed (check crawl4ai docs for explicit close method)
# Based on previous code, seems no explicit close needed, but keep check just in case
if crawler and hasattr(crawler, 'aclose'):
try:
logger.info("Closing AsyncWebCrawler...")
# await crawler.aclose() # Example if an async close exists
except Exception as e:
logger.error(f"Error closing crawler: {e}", exc_info=True)
# Close database engine connections if necessary (usually handled automatically by SQLAlchemy)
# if engine and hasattr(engine, 'dispose'): # Check if using async engine that needs dispose
# await engine.dispose()
logger.info("Application shutdown complete.")
# --- Include Routers ---
app.include_router(history.router)
app.include_router(bookmarks.router)
app.include_router(api_config.router)
app.include_router(websocket.router)
app.include_router(ui.router)
# Optional: Add a root endpoint for health check or basic info
@app.get("/health", tags=["service"])
async def health_check():
# Extended health check could verify DB connection or task status
db_ok = False
try:
with next(get_db()) as db:
db.execute("SELECT 1")
db_ok = True
except Exception:
db_ok = False
return {
"status": "ok",
"database_connection": "ok" if db_ok else "error",
# Add other checks as needed
}
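For orientation, a minimal sketch of the crawl4ai call pattern that the startup code wires up and the websocket handler uses (see app/routers/websocket.py below). Constructor options and result fields can vary between crawl4ai versions, so treat this as illustrative rather than as the app's exact flow.
import asyncio
from crawl4ai import AsyncWebCrawler

async def fetch_markdown(url: str) -> str:
    # The async context manager handles browser startup/shutdown for the crawl.
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url=url)
        # result.markdown holds the page rendered as markdown (may be empty on failure).
        return result.markdown or ""

if __name__ == "__main__":
    print(asyncio.run(fetch_markdown("https://example.com"))[:200])  # example URL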

View File

@@ -1,117 +0,0 @@
import re
from markdownify import markdownify as md
from bs4 import BeautifulSoup
from typing import Optional
from urllib.parse import urlparse
from .config import ReaderConfig
from .logging_config import setup_logger
from .database import SessionLocal
# Setup logger for this module
logger = setup_logger(__name__)
# Patterns for cleaning
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
META_PATTERN = r"<[ ]*meta.*?>"
COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
LINK_PATTERN = r"<[ ]*link.*?>"
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
class PageReader:
def __init__(self):
self.config = ReaderConfig()
logger.info("PageReader initialized")
def clean_html(self, html: str) -> str:
"""Clean HTML by removing unwanted elements and patterns."""
if not html:
logger.warning("Received empty HTML to clean")
return ""
logger.debug(f"Cleaning HTML of length: {len(html)}")
# First use regex to remove problematic patterns
html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
html = re.sub(BASE64_IMG_PATTERN, "", html)
try:
# Use BeautifulSoup to remove additional elements we want to strip
soup = BeautifulSoup(html, 'html.parser')
# Remove unwanted elements
elements_to_remove = [
'canvas', 'img', 'picture', 'audio', 'video',
'iframe', 'embed', 'object', 'param', 'track',
'map', 'area', 'source'
]
for element in elements_to_remove:
removed = len(soup.find_all(element))
if removed:
logger.debug(f"Removed {removed} {element} elements")
for tag in soup.find_all(element):
tag.decompose()
return str(soup)
except Exception as e:
logger.error(f"Error cleaning HTML: {e}", exc_info=True)
return ""
def clean_whitespace(self, text: str) -> str:
"""Clean excessive whitespace from text."""
if not text:
return ""
try:
# Replace 3 or more newlines with 2 newlines
cleaned = re.sub(r'\n{3,}', '\n\n', text)
# Remove trailing whitespace from each line
cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
return cleaned.strip()
except Exception as e:
logger.error(f"Error cleaning whitespace: {e}")
return ""
def html_to_markdown(self, html: str) -> Optional[str]:
"""Convert HTML to markdown."""
try:
logger.info("Starting HTML to Markdown conversion")
logger.debug(f"Input HTML length: {len(html)}")
cleaned_html = self.clean_html(html)
logger.debug(f"Cleaned HTML length: {len(cleaned_html)}")
if not cleaned_html:
logger.warning("No cleaned HTML content")
return None
markdown = self.clean_whitespace(md(cleaned_html,
heading_style="ATX",
bullets="-",
autolinks=True,
strip=['form'],
escape_asterisks=True,
escape_underscores=True))
logger.debug(f"Generated markdown length: {len(markdown) if markdown else 0}")
if not markdown or markdown.isspace():
logger.warning("Markdown is empty or whitespace only")
return None
return markdown
except Exception as e:
logger.error("Error converting to markdown", exc_info=True)
return None
async def close(self):
"""Cleanup resources"""
logger.info("Closing PageReader")
pass # No need to close DB connection anymore

app/routers/bookmarks.py (new file, 47 lines)
View File

@@ -0,0 +1,47 @@
from fastapi import APIRouter, Depends, Query, HTTPException
from sqlalchemy.orm import Session
from typing import List, Optional
from ..database import get_db, Bookmark
from ..utils import serialize_bookmark
from ..logging_config import setup_logger
logger = setup_logger(__name__)
router = APIRouter(prefix="/bookmarks", tags=["bookmarks"])
@router.get("/search")
async def search_bookmarks(
domain: Optional[str] = Query(None),
folder: Optional[str] = Query(None),
search_term: Optional[str] = Query(None),
db: Session = Depends(get_db)
):
"""Search bookmarks with optimized queries"""
try:
# Build query efficiently
query = db.query(Bookmark)
# Apply filters using index-optimized queries
if domain:
query = query.filter(Bookmark.domain == domain)
if folder:
query = query.filter(Bookmark.folder == folder)
if search_term:
# Use LIKE for title search (consider FTS for bookmarks if needed)
search_pattern = f"%{search_term}%"
query = query.filter(Bookmark.title.ilike(search_pattern))
# Removed index hint as SQLAlchemy/SQLite usually handles this well with LIKE
# Add ordering and limit for better performance
bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
return [serialize_bookmark(bookmark) for bookmark in bookmarks]
except Exception as e:
logger.error(f"Bookmark search error: {e}", exc_info=True)
raise HTTPException(
status_code=500,
detail={"message": "Bookmark search operation failed", "error": str(e)}
)
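A hedged example of exercising this endpoint from a client. The base URL and port are assumptions (uvicorn's default) and the filter values are made up; the response fields mirror serialize_bookmark.
import requests

resp = requests.get(
    "http://localhost:8000/bookmarks/search",
    params={"domain": "github.com", "search_term": "fastapi"},  # example filters
    timeout=10,
)
resp.raise_for_status()
for bookmark in resp.json():
    # serialize_bookmark returns id, url, title, added_time, folder, domain.
    print(bookmark["added_time"], bookmark["title"], bookmark["url"])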

app/routers/config.py (new file, 43 lines)
View File

@@ -0,0 +1,43 @@
from fastapi import APIRouter, Depends, HTTPException
from typing import List
from ..config import Config
from ..logging_config import setup_logger
logger = setup_logger(__name__)
router = APIRouter(prefix="/config", tags=["config"])
# Assuming config is a singleton or easily accessible
# If not, you might need to use Depends or app state
config = Config()
@router.get("/ignored-domains")
async def get_ignored_domains():
"""Get list of ignored domain patterns"""
try:
return {"ignored_domains": config.config.get('ignored_domains', [])}
except Exception as e:
logger.error(f"Error getting ignored domains: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Failed to retrieve ignored domains")
@router.post("/ignored-domains")
async def add_ignored_domain(pattern: str):
"""Add a new domain pattern to ignored list"""
try:
config.add_ignored_domain(pattern)
return {"status": "success", "message": f"Added pattern: {pattern}"}
except Exception as e:
logger.error(f"Error adding ignored domain '{pattern}': {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Failed to add ignored domain")
@router.delete("/ignored-domains/{pattern}")
async def remove_ignored_domain(pattern: str):
"""Remove a domain pattern from ignored list"""
try:
config.remove_ignored_domain(pattern)
return {"status": "success", "message": f"Removed pattern: {pattern}"}
except Exception as e:
logger.error(f"Error removing ignored domain '{pattern}': {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Failed to remove ignored domain")
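These routes assume the Config object exposes add_ignored_domain/remove_ignored_domain helpers and an ignored_domains key. Under that assumption, a client-side sketch might look like the following (local URL and port are assumptions; the pattern value is made up).
import requests

BASE = "http://localhost:8000/config/ignored-domains"  # assumed local deployment

# Add a pattern (FastAPI reads the bare `pattern: str` parameter from the query string).
requests.post(BASE, params={"pattern": "*.ads.example.com"}, timeout=10).raise_for_status()

# List current patterns.
print(requests.get(BASE, timeout=10).json())

# Remove the pattern again (for DELETE it is part of the path).
requests.delete(f"{BASE}/*.ads.example.com", timeout=10).raise_for_status()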

app/routers/history.py (new file, 132 lines)
View File

@@ -0,0 +1,132 @@
from fastapi import APIRouter, Depends, Query, HTTPException
from sqlalchemy.orm import Session
from sqlalchemy import text
from typing import List, Optional
from ..database import get_db, HistoryEntry
from ..utils import serialize_history_entry
from ..logging_config import setup_logger
logger = setup_logger(__name__)
router = APIRouter(prefix="/history", tags=["history"])
@router.get("/search")
async def search_history(
query: Optional[str] = Query(None),
domain: Optional[str] = Query(None),
start_date: Optional[str] = Query(None),
end_date: Optional[str] = Query(None),
include_content: bool = Query(False),
db: Session = Depends(get_db)
):
"""Search history using FTS5"""
try:
if query:
# Build the FTS query
# Basic query sanitization/escaping might be needed depending on FTS syntax usage
# For simple term search, this is okay. For complex FTS syntax, more care is needed.
fts_conditions = []
params = {}
# Handle different query parts (title, content, domain)
# Example: "term1 title:term2 domain:example.com"
# This requires more sophisticated parsing. For now, assume simple query applies to title/content.
# A safer approach for user input:
sanitized_query = query.replace('"', '""') # Basic FTS escaping for quotes
fts_match_expr = f'(title : "{sanitized_query}"* OR markdown_content : "{sanitized_query}"*)'
params['fts_query'] = fts_match_expr
if domain:
# Add domain filtering directly in FTS if possible and indexed
# Assuming 'domain' is an indexed column in FTS table
# params['fts_query'] += f' AND domain : "{domain}"' # Adjust FTS syntax if needed
# Or filter after FTS search if domain isn't in FTS index efficiently
pass # Domain filtering will be added later if needed
# Build the SQL query
sql = """
SELECT
h.*,
bm25(history_fts) as rank,
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
FROM history_fts
JOIN history h ON history_fts.rowid = h.id
WHERE history_fts MATCH :fts_query
"""
# Add domain filter as a regular WHERE clause if not in FTS MATCH
if domain:
sql += " AND h.domain = :domain"
params['domain'] = domain
# Add date filters if provided
if start_date:
sql += " AND h.visit_time >= :start_date"
params['start_date'] = start_date
if end_date:
sql += " AND h.visit_time <= :end_date"
params['end_date'] = end_date
sql += " ORDER BY rank DESC, h.visit_time DESC LIMIT 100" # Rank usually descends
results = db.execute(text(sql), params).fetchall()
# Use the updated serializer that handles potential highlight/rank fields
return [serialize_history_entry(row, include_content) for row in results]
else:
# Handle non-search queries (basic filtering)
query_builder = db.query(HistoryEntry)
if domain:
query_builder = query_builder.filter(HistoryEntry.domain == domain)
if start_date:
query_builder = query_builder.filter(HistoryEntry.visit_time >= start_date)
if end_date:
query_builder = query_builder.filter(HistoryEntry.visit_time <= end_date)
entries = query_builder.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
return [serialize_history_entry(entry, include_content) for entry in entries]
except Exception as e:
logger.error(f"Search error: {str(e)}", exc_info=True)
raise HTTPException(
status_code=500,
detail={"message": "Search operation failed", "error": str(e)}
)
@router.get("/search/advanced")
async def advanced_history_search(
query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
include_content: bool = Query(False),
db: Session = Depends(get_db)
):
"""Advanced full-text search using SQLite FTS5 features"""
try:
# Use raw SQL for advanced FTS query
# Add rank and highlights here as well
fts_query = """
SELECT
h.*,
bm25(history_fts) as rank,
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
FROM history_fts
JOIN history h ON history_fts.rowid = h.id
WHERE history_fts MATCH :query
ORDER BY rank DESC, h.visit_time DESC
LIMIT 1000
"""
results = db.execute(text(fts_query), {'query': query}).fetchall()
# Use the updated serializer
return [serialize_history_entry(row, include_content) for row in results]
except Exception as e:
logger.error(f"Advanced search error: {e}", exc_info=True)
raise HTTPException(
status_code=500,
detail={"message": "Advanced search operation failed", "error": str(e)}
)
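For readers unfamiliar with the FTS5 features used above (MATCH, bm25() ranking, highlight()), here is a self-contained sqlite3 illustration with made-up data; the virtual-table columns mirror the app's history_fts schema.
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
    CREATE VIRTUAL TABLE history_fts USING fts5(title, markdown_content, domain);
    INSERT INTO history_fts VALUES
        ('FastAPI tutorial', 'Building APIs with FastAPI and SQLite', 'example.com'),
        ('Unrelated page', 'Nothing relevant here', 'other.org');
""")

rows = conn.execute(
    """
    SELECT bm25(history_fts) AS rank,
           highlight(history_fts, 0, '<mark>', '</mark>') AS title_highlight
    FROM history_fts
    WHERE history_fts MATCH ?
    ORDER BY rank
    """,
    ('title:"fastapi"* OR markdown_content:"fastapi"*',),
).fetchall()

for rank, title_highlight in rows:
    # bm25() returns more negative scores for better matches, so ascending order puts the best first.
    print(rank, title_highlight)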

app/routers/ui.py (new file, 52 lines)
View File

@@ -0,0 +1,52 @@
from fastapi import APIRouter, Depends, Request
from fastapi.templating import Jinja2Templates
from sqlalchemy.orm import Session
from ..database import get_db, HistoryEntry, Bookmark
from ..logging_config import setup_logger
logger = setup_logger(__name__)
router = APIRouter(tags=["ui"])
templates = Jinja2Templates(directory="app/templates")
@router.get("/")
async def home(request: Request, db: Session = Depends(get_db)):
try:
# Get recent history entries
entries = db.query(HistoryEntry)\
.order_by(HistoryEntry.visit_time.desc())\
.limit(50)\
.all()
return templates.TemplateResponse(
"index.html",
{"request": request, "entries": entries}
)
except Exception as e:
logger.error(f"Error loading home page: {e}", exc_info=True)
# Optionally return an error template
return templates.TemplateResponse("error.html", {"request": request, "detail": "Could not load history"})
@router.get("/search")
async def search_page(request: Request):
return templates.TemplateResponse(
"search.html",
{"request": request}
)
@router.get("/bookmarks")
async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
try:
bookmarks = db.query(Bookmark)\
.order_by(Bookmark.added_time.desc())\
.limit(50)\
.all()
return templates.TemplateResponse(
"bookmarks.html",
{"request": request, "bookmarks": bookmarks}
)
except Exception as e:
logger.error(f"Error loading bookmarks page: {e}", exc_info=True)
# Optionally return an error template
return templates.TemplateResponse("error.html", {"request": request, "detail": "Could not load bookmarks"})

app/routers/websocket.py (new file, 175 lines)
View File

@@ -0,0 +1,175 @@
import asyncio
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, HTTPException
from sqlalchemy.orm import Session
from datetime import datetime, timezone, timedelta
from urllib.parse import urlparse
import iso8601
# Import necessary components from other modules
from .. import main as app_main # To access global crawler instance
from ..database import get_db, HistoryEntry
from ..config import Config
from ..logging_config import setup_logger
logger = setup_logger(__name__)
router = APIRouter(tags=["websocket"])
config = Config() # Assuming config is okay as a separate instance here
@router.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
# Access the global crawler instance from main.py
crawler = app_main.crawler
if not crawler:
logger.error("Crawler not initialized!")
await websocket.close(code=1011) # Internal Server Error
return
logger.info("New WebSocket connection established")
await websocket.accept()
try:
while True:
data = await websocket.receive_json()
# Validate incoming data structure (basic check)
if 'url' not in data or 'timestamp' not in data:
logger.warning("Received invalid WebSocket message format.")
await websocket.send_json({
"status": "error",
"message": "Invalid message format. 'url' and 'timestamp' required."
})
continue
url = data['url']
try:
timestamp = iso8601.parse_date(data['timestamp'])
except iso8601.ParseError:
logger.warning(f"Received invalid timestamp format: {data['timestamp']}")
await websocket.send_json({
"status": "error",
"message": f"Invalid timestamp format: {data['timestamp']}"
})
continue
# Parse the URL and check if domain should be ignored
try:
domain = urlparse(url).netloc
if not domain: # Handle invalid URLs
raise ValueError("Could not parse domain from URL")
except ValueError as e:
logger.warning(f"Could not parse URL: {url}. Error: {e}")
await websocket.send_json({"status": "error", "message": f"Invalid URL: {url}"})
continue
if config.is_domain_ignored(domain):
logger.info(f"Ignoring domain: {domain} for URL: {url}")
await websocket.send_json({
"status": "ignored",
"message": f"Domain {domain} is in ignore list"
})
continue
logger.info(f"Processing page via WebSocket: {url}")
# Check if we already have a recent entry for this URL
# Make timestamp timezone-aware (assuming UTC if naive)
if timestamp.tzinfo is None:
timestamp = timestamp.replace(tzinfo=timezone.utc)
else:
timestamp = timestamp.astimezone(timezone.utc)
recent_threshold = timestamp - timedelta(minutes=5)
existing_entry = db.query(HistoryEntry.id).filter(
HistoryEntry.url == url,
HistoryEntry.visit_time >= recent_threshold
).first() # Only fetch ID for efficiency
if existing_entry:
logger.info(f"Recent entry exists for URL: {url}")
await websocket.send_json({
"status": "skipped",
"message": "Recent entry exists"
})
continue
# --- Start crawl4ai processing ---
logger.info(f"Processing page with crawl4ai: {url}")
markdown_content = None
title = ''
try:
# Use the global crawler instance
crawl_result = await crawler.arun(url=url)
if crawl_result:
markdown_content = crawl_result.markdown
# Attempt to get title from metadata, fallback to empty string
title = getattr(crawl_result.metadata, 'title', '') or '' # Ensure title is string
if not title:
logger.warning(f"Could not extract title for {url} using crawl4ai.")
logger.info(f"crawl4ai processing complete. Markdown length: {len(markdown_content) if markdown_content else 0}, Title: '{title}'")
else:
logger.warning(f"crawl4ai returned None for URL: {url}")
markdown_content = "" # Ensure it's not None
title = ""
except Exception as crawl_error:
logger.error(f"crawl4ai failed for URL {url}: {crawl_error}", exc_info=True)
await websocket.send_json({
"status": "error",
"message": f"Failed to crawl page content: {str(crawl_error)}"
})
continue # Skip to next message
# --- End crawl4ai processing ---
# Only proceed if we got some content or at least a title
if not title and not markdown_content:
logger.info(f"No title or content extracted by crawl4ai from: {url}")
await websocket.send_json({
"status": "skipped",
"message": "No title or content extracted by crawl4ai"
})
continue
# Create history entry using data from crawl4ai
history_entry = HistoryEntry(
url=url,
title=title, # Use title from crawl4ai
visit_time=timestamp, # Use the parsed, timezone-aware timestamp
domain=domain,
markdown_content=markdown_content, # Use markdown from crawl4ai
last_content_update=datetime.now(timezone.utc)
)
logger.debug(f"Attempting to save entry for {url} with markdown length: {len(markdown_content) if markdown_content else 0}")
db.add(history_entry)
try:
db.commit()
logger.info(f"Successfully saved entry for: {url}")
await websocket.send_json({
"status": "success",
"message": f"Processed page: {url}"
})
except Exception as e:
db.rollback()
logger.error(f"Error saving entry for {url}: {e}", exc_info=True)
await websocket.send_json({
"status": "error",
"message": "Database error occurred while saving."
})
except WebSocketDisconnect:
logger.info("WebSocket client disconnected")
except Exception as e:
logger.error(f"Unhandled error in WebSocket handler: {e}", exc_info=True)
# Attempt to inform client before closing (might fail if connection is already broken)
try:
await websocket.send_json({
"status": "error",
"message": "An internal server error occurred."
})
except Exception:
pass # Ignore if sending fails
# Ensure connection is closed on server error
try:
await websocket.close(code=1011) # Internal Server Error
except Exception:
pass # Ignore if closing fails
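A hedged sketch of what a browser-extension client might send to this endpoint. The websockets package and the local URL/port are assumptions; only the url and timestamp fields are required by the handler, since the page content is re-fetched server-side via crawl4ai.
import asyncio
import json
from datetime import datetime, timezone

import websockets  # third-party client library, assumed installed

async def report_visit(page_url: str) -> None:
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        await ws.send(json.dumps({
            "url": page_url,
            "timestamp": datetime.now(timezone.utc).isoformat(),  # ISO 8601, parsed server-side with iso8601
        }))
        print(await ws.recv())  # e.g. {"status": "success", "message": "Processed page: ..."}

asyncio.run(report_visit("https://example.com/article"))  # example URL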

View File

@@ -1,142 +1,386 @@
-from fastapi import BackgroundTasks
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, timezone
import asyncio
-from .database import SessionLocal, HistoryEntry, Bookmark, get_last_processed_timestamp, update_last_processed_timestamp
+from sqlalchemy import or_, update
+from .database import HistoryEntry, Bookmark, get_last_processed_timestamp, update_last_processed_timestamp
from .browser import BrowserHistoryCollector
-from .page_reader import PageReader
-from sqlalchemy import func
-from sqlalchemy.orm import Session
-import pytz
from .config import Config
from .database import get_db
-from urllib.parse import urlparse
+import urllib.parse
import logging
+from crawl4ai import AsyncWebCrawler
+from typing import Optional
logger = logging.getLogger(__name__)
class HistoryScheduler:
-def __init__(self):
+def __init__(self, crawler: AsyncWebCrawler):
self.browser_collector = BrowserHistoryCollector()
-self.page_reader = PageReader()
self.last_history_update = None
self.content_update_interval = timedelta(hours=24) # Update content daily
self.config = Config()
self.db_lock = asyncio.Lock()
+self.crawler = crawler
-def _normalize_datetime(self, dt: datetime) -> datetime:
+def _normalize_datetime(self, dt: datetime) -> Optional[datetime]:
-"""Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't"""
+"""Convert datetime to UTC if it has timezone, or make it timezone-aware (UTC) if it doesn't"""
if dt is None:
return None
-# If datetime is naive (no timezone), assume it's in UTC
-if dt.tzinfo is None:
-return pytz.UTC.localize(dt)
+# If datetime is naive (no timezone), assume it's local and convert to UTC
+if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
+# Assume local timezone if naive, then convert to UTC
+# This might need adjustment based on where the naive datetime originates
+# If browser_history always returns naive UTC, use: dt.replace(tzinfo=timezone.utc)
+# If browser_history returns naive local time:
+dt = dt.astimezone() # Make timezone-aware using system's local timezone
+return dt.astimezone(timezone.utc) # Convert to UTC
-# If datetime has timezone, convert to UTC
+# If datetime already has timezone, convert to UTC
-return dt.astimezone(pytz.UTC)
+return dt.astimezone(timezone.utc)
async def update_bookmarks(self): async def update_bookmarks(self):
"""Update bookmarks from browsers""" """Update bookmarks from browsers"""
try: try:
current_timestamp = int(datetime.now().timestamp()) # Use timezone-aware current time
current_timestamp_dt = datetime.now(timezone.utc)
current_timestamp = int(current_timestamp_dt.timestamp())
source_key = "browser_bookmarks" source_key = "browser_bookmarks"
last_timestamp = get_last_processed_timestamp(source_key) # Ensure last_timestamp is 0 if None
last_timestamp = get_last_processed_timestamp(source_key) or 0
logger.info(f"Fetching bookmarks. Last processed timestamp: {last_timestamp}") logger.info(f"Fetching bookmarks. Last processed timestamp (UTC epoch): {last_timestamp}")
bookmarks = self.browser_collector.fetch_bookmarks() bookmarks = self.browser_collector.fetch_bookmarks()
logger.info(f"Found {len(bookmarks)} total bookmarks") logger.info(f"Found {len(bookmarks)} total bookmarks")
# Filter for only new bookmarks new_bookmarks = []
new_bookmarks = [ skipped_ignored = 0
(added_time, url, title, folder) for added_time, url, title, folder in bookmarks processed_urls = set() # Avoid processing duplicate bookmark URLs within the same batch
if self._normalize_datetime(added_time).timestamp() > last_timestamp
] for added_time, url, title, folder in bookmarks:
if not url or url in processed_urls: # Skip empty or duplicate URLs in this batch
continue
# Normalize timestamp *before* comparison
normalized_added_time = self._normalize_datetime(added_time)
if normalized_added_time is None:
logger.warning(f"Skipping bookmark with invalid timestamp: {url} - {title}")
continue
# Compare timestamps after normalization
if normalized_added_time.timestamp() > last_timestamp:
domain = urllib.parse.urlparse(url).netloc
if self.config.is_domain_ignored(domain):
# logger.debug(f"Skipping ignored domain for bookmark: {domain}")
skipped_ignored += 1
continue
new_bookmarks.append((normalized_added_time, url, title, folder, domain))
processed_urls.add(url) # Mark URL as processed for this batch
logger.info(f"Found {len(new_bookmarks)} new bookmarks to process after filtering.")
if skipped_ignored > 0:
logger.info(f"Skipped {skipped_ignored} bookmarks due to ignored domains.")
logger.info(f"Found {len(new_bookmarks)} new bookmarks to process")
if new_bookmarks: if new_bookmarks:
async with self.db_lock: async with self.db_lock:
# Use context manager for session
with next(get_db()) as db: with next(get_db()) as db:
added_count = 0 added_count = 0
for added_time, url, title, folder in new_bookmarks: try:
domain = urlparse(url).netloc for norm_added_time, url, title, folder, domain in new_bookmarks:
if self.config.is_domain_ignored(domain): # Optional: Check if bookmark already exists (by URL)
logger.debug(f"Skipping ignored domain: {domain}") # existing = db.query(Bookmark.id).filter(Bookmark.url == url).first()
continue # if existing:
# logger.debug(f"Bookmark already exists: {url}")
# continue
added_time = self._normalize_datetime(added_time) bookmark = Bookmark(
url=url,
title=title or "", # Ensure title is not None
added_time=norm_added_time,
folder=folder or "", # Ensure folder is not None
domain=domain
)
db.add(bookmark)
added_count += 1
bookmark = Bookmark( if added_count > 0:
url=url, db.commit()
title=title, logger.info(f"Successfully committed {added_count} new bookmarks.")
added_time=added_time, # Update timestamp only if new bookmarks were added
folder=folder, update_last_processed_timestamp(source_key, current_timestamp)
domain=domain logger.info(f"Updated last processed bookmark timestamp for '{source_key}' to {current_timestamp}")
) else:
db.add(bookmark) logger.info("No new unique bookmarks to add in this batch.")
added_count += 1 # Optionally update timestamp even if no *new* bookmarks were added,
# to signify the check was performed up to 'current_timestamp'.
# update_last_processed_timestamp(source_key, current_timestamp)
# logger.info(f"Updated last processed bookmark timestamp check for '{source_key}' to {current_timestamp}")
db.commit()
logger.info(f"Successfully added {added_count} new bookmarks")
except Exception as e:
logger.error(f"Error committing bookmarks: {str(e)}", exc_info=True)
db.rollback()
else:
logger.info("No new bookmarks found since last check.")
# Update timestamp to indicate the check was performed
update_last_processed_timestamp(source_key, current_timestamp) update_last_processed_timestamp(source_key, current_timestamp)
logger.info(f"Updated last processed timestamp to {current_timestamp}") logger.info(f"Updated last processed bookmark timestamp check for '{source_key}' to {current_timestamp}")
except Exception as e: except Exception as e:
logger.error(f"Error updating bookmarks: {str(e)}", exc_info=True) logger.error(f"Error updating bookmarks: {str(e)}", exc_info=True)
    async def update_history(self):
        """Background task to update history periodically"""
        # Initial sleep to allow startup tasks (like initial sync) to potentially finish first
        await asyncio.sleep(10)
        while True:
            try:
                # Use timezone-aware current time
                current_timestamp_dt = datetime.now(timezone.utc)
                current_timestamp = int(current_timestamp_dt.timestamp())
                source_key = "browser_history_scheduler"  # Use a different key than initial sync
                # Ensure last_timestamp is 0 if None
                last_timestamp = get_last_processed_timestamp(source_key) or 0

                logger.info(f"Scheduler: Fetching history. Last processed timestamp (UTC epoch): {last_timestamp}")
                history_entries = self.browser_collector.fetch_history()
                logger.info(f"Scheduler: Found {len(history_entries)} total history entries from browser.")

                new_entries = []
                skipped_ignored = 0
                processed_urls_times = set()  # Avoid duplicates within the batch (url, timestamp)

                for visit_time, url, title in history_entries:
                    # Basic validation
                    if not url or not visit_time:
                        logger.warning(f"Scheduler: Skipping entry with missing URL or timestamp: {title}")
                        continue

                    # Normalize timestamp *before* comparison
                    normalized_visit_time = self._normalize_datetime(visit_time)
                    if normalized_visit_time is None:
                        logger.warning(f"Scheduler: Skipping history with invalid timestamp: {url} - {title}")
                        continue

                    # Compare timestamps after normalization
                    if normalized_visit_time.timestamp() > last_timestamp:
                        entry_key = (url, normalized_visit_time.timestamp())
                        if entry_key in processed_urls_times:
                            continue  # Skip duplicate within this batch

                        domain = urllib.parse.urlparse(url).netloc
                        if self.config.is_domain_ignored(domain):
                            # logger.debug(f"Scheduler: Skipping ignored domain: {domain}")
                            skipped_ignored += 1
                            continue

                        new_entries.append((normalized_visit_time, url, title, domain))
                        processed_urls_times.add(entry_key)

                logger.info(f"Scheduler: Found {len(new_entries)} new history entries to process after filtering.")
                if skipped_ignored > 0:
                    logger.info(f"Scheduler: Skipped {skipped_ignored} history entries due to ignored domains.")

                if new_entries:
                    async with self.db_lock:
                        # Use context manager for session
                        with next(get_db()) as db:
                            added_count = 0
                            try:
                                for norm_visit_time, url, title, domain in new_entries:
                                    # Optional: More robust check if entry already exists
                                    # existing = db.query(HistoryEntry.id).filter(
                                    #     HistoryEntry.url == url,
                                    #     HistoryEntry.visit_time == norm_visit_time
                                    # ).first()
                                    # if existing:
                                    #     logger.debug(f"Scheduler: History entry already exists: {url} at {norm_visit_time}")
                                    #     continue

                                    history_entry = HistoryEntry(
                                        url=url,
                                        title=title or "",  # Ensure title is not None
                                        visit_time=norm_visit_time,
                                        domain=domain
                                        # markdown_content is initially NULL
                                    )
                                    db.add(history_entry)
                                    added_count += 1

                                if added_count > 0:
                                    db.commit()
                                    logger.info(f"Scheduler: Successfully committed {added_count} new history entries.")
                                    # Update timestamp only if new entries were added
                                    update_last_processed_timestamp(source_key, current_timestamp)
                                    logger.info(f"Scheduler: Updated last processed history timestamp for '{source_key}' to {current_timestamp}")
                                else:
                                    logger.info("Scheduler: No new unique history entries to add in this batch.")
                                    # Optionally update timestamp even if no *new* entries were added
                                    # update_last_processed_timestamp(source_key, current_timestamp)
                                    # logger.info(f"Scheduler: Updated last processed history timestamp check for '{source_key}' to {current_timestamp}")
                            except Exception as e:
                                logger.error(f"Scheduler: Error committing history: {str(e)}", exc_info=True)
                                db.rollback()
                else:
                    logger.info("Scheduler: No new history entries found since last check.")
                    # Update timestamp to indicate the check was performed
                    update_last_processed_timestamp(source_key, current_timestamp)
                    logger.info(f"Scheduler: Updated last processed history timestamp check for '{source_key}' to {current_timestamp}")
            except Exception as e:
                logger.error(f"Scheduler: Error in update_history loop: {str(e)}", exc_info=True)

            # --- Access config value using property ---
            try:
                # Use direct attribute access via the @property
                wait_time = self.config.history_update_interval_seconds
            except Exception as config_err:
                logger.error(f"Scheduler (History): Error accessing config for wait time, using default 300s. Error: {config_err}")
                wait_time = 300
            # --- End Access ---
            logger.debug(f"Scheduler (History): Sleeping for {wait_time} seconds.")
            await asyncio.sleep(wait_time)  # Use the obtained wait_time
async def _process_markdown_batch(self):
"""Fetches and processes one batch (up to 10) of history entries needing markdown."""
entries_to_process = []
try:
# --- Query for entries (inside DB lock/session) ---
async with self.db_lock:
with next(get_db()) as db:
# Find up to 10 entries where markdown_content is NULL or empty string
entries_to_process = db.query(HistoryEntry).filter(
or_(HistoryEntry.markdown_content == None, HistoryEntry.markdown_content == '')
).order_by(HistoryEntry.visit_time.asc()).limit(10).all()
if entries_to_process:
logger.info(f"Markdown Processor: Found {len(entries_to_process)} entries to process in this batch.")
for entry in entries_to_process:
db.expunge(entry) # Detach before async operations
else:
logger.info("Markdown Processor: No history entries found needing markdown update in this batch.")
return # Nothing to do in this batch
# --- Crawling and Updating (outside the DB lock/session) ---
processed_count = 0
skipped_ignored = 0
for entry in entries_to_process:
markdown_content = None
crawl_success = False
should_update_db = False
# --- ADD DOMAIN CHECK ---
try:
# +++ Add Debugging Lines +++
logger.debug(f"Debugging urllib.parse type: {type(urllib.parse)}")
logger.debug(f"Is 'urlparse' in urllib.parse? {'urlparse' in dir(urllib.parse)}")
# +++ End Debugging Lines +++
domain = urllib.parse.urlparse(entry.url).netloc
if self.config.is_domain_ignored(domain):
logger.debug(f"Markdown Processor: Skipping ignored domain: {domain} for URL: {entry.url} (ID={entry.id})")
skipped_ignored += 1
continue
except Exception as parse_err:
logger.warning(f"Markdown Processor: Error parsing URL to get domain: {entry.url} (ID={entry.id}). Type={type(parse_err).__name__} Error: {parse_err}. Skipping entry.")
continue
# --- END DOMAIN CHECK ---
try:
logger.info(f"Markdown Processor: Crawling URL: {entry.url} (ID={entry.id})")
if not self.crawler:
logger.error("Markdown Processor: Crawler not initialized!")
break # Stop processing this batch if crawler is missing
result = await self.crawler.arun(url=entry.url)
if result and result.markdown:
markdown_content = result.markdown
crawl_success = True
logger.info(f"Markdown Processor: Successfully crawled and got markdown for ID={entry.id}.")
else:
logger.warning(f"Markdown Processor: Crawling completed but no markdown content found for ID={entry.id}, URL={entry.url}")
markdown_content = "" # Mark as processed without content
crawl_success = True
should_update_db = True
except Exception as crawl_error:
logger.error(f"Markdown Processor: Error crawling URL {entry.url} (ID={entry.id}) Type={type(crawl_error).__name__}: {crawl_error}", exc_info=False)
should_update_db = False # Don't update DB on crawl error
# --- Update DB for this specific entry ---
if should_update_db:
try:
async with self.db_lock:
with next(get_db()) as db_update:
stmt = (
update(HistoryEntry)
.where(HistoryEntry.id == entry.id)
.values(markdown_content=markdown_content)
)
result_proxy = db_update.execute(stmt)
if result_proxy.rowcount > 0:
db_update.commit()
# Adjust log message based on whether it was skipped or processed
if crawl_success and markdown_content == "":  # Marked as processed because the crawl returned no content
logger.info(f"Markdown Processor: Marked entry as processed (no content found) for ID={entry.id}.")
elif crawl_success:
logger.info(f"Markdown Processor: Successfully updated markdown status for ID={entry.id}.")
# Only increment processed_count if actual content was added or marked empty after crawl
if markdown_content is not None: # Includes actual markdown or empty string marker
processed_count += 1
else:
logger.warning(f"Markdown Processor: Could not find entry ID={entry.id} to update markdown status (rowcount 0).")
db_update.rollback()
except Exception as db_update_error:
logger.error(f"Markdown Processor: Error updating database for ID={entry.id}: {db_update_error}", exc_info=True)
log_suffix = f"Updated {processed_count}"
if skipped_ignored > 0:
log_suffix += f", Skipped {skipped_ignored} (ignored domain)"
log_suffix += f" out of {len(entries_to_process)} entries in this batch."
logger.info(f"Markdown Processor: Finished processing batch. {log_suffix}")
except Exception as e:
logger.error(f"Markdown Processor: Error processing markdown batch: {str(e)}", exc_info=True)
async def update_missing_markdown_periodically(self):
"""Periodically triggers the processing of batches of history entries needing markdown."""
# Initial slight delay to ensure startup tasks settle
await asyncio.sleep(15)
logger.info("Starting periodic markdown update task...")
while True:
await self._process_markdown_batch() # Process one batch
# Wait before checking for the next batch
# --- Access config value using property ---
try:
# Use direct attribute access via the @property
wait_time = self.config.markdown_update_interval_seconds
except Exception as config_err:
logger.error(f"Periodic Markdown Updater: Error accessing config for wait time, using default 300s. Error: {config_err}")
wait_time = 300
# --- End Access ---
logger.debug(f"Periodic Markdown Updater: Sleeping for {wait_time} seconds before next batch.")
await asyncio.sleep(wait_time)
    async def close(self):
        """Cleanup resources"""
        logger.info("Closing scheduler resources...")
        # Add any specific cleanup needed for BrowserHistoryCollector if necessary
        # The crawler is managed and closed (if needed) in main.py's shutdown
        pass
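For orientation, a minimal sketch of how these background loops could be started and shut down from the FastAPI application is shown below. The real wiring lives in main.py, which is not part of this diff; the Scheduler class name, its import path, and the lifespan hook are assumptions made only for illustration.

# Hypothetical startup/shutdown wiring; the actual main.py may differ.
import asyncio
from contextlib import asynccontextmanager

from fastapi import FastAPI

from app.scheduler import Scheduler  # assumed module and class name


@asynccontextmanager
async def lifespan(app: FastAPI):
    scheduler = Scheduler()
    tasks = [
        asyncio.create_task(scheduler.update_history()),
        asyncio.create_task(scheduler.update_missing_markdown_periodically()),
    ]
    try:
        yield
    finally:
        # Stop the periodic loops and release scheduler resources on shutdown
        for task in tasks:
            task.cancel()
        await scheduler.close()


app = FastAPI(lifespan=lifespan)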

45
app/utils.py Normal file
View File

@@ -0,0 +1,45 @@
from datetime import datetime
from .database import HistoryEntry, Bookmark
def serialize_history_entry(entry, include_content: bool = False):
"""Serialize a HistoryEntry object or raw SQL result to a dictionary"""
# Handle both ORM objects and raw SQL results
if hasattr(entry, '_mapping'): # Raw SQL result (from execute)
result = {
"id": entry.id,
"url": entry.url,
"title": entry.title,
"visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
"domain": entry.domain,
# Add potential highlight fields if they exist
"title_highlight": getattr(entry, 'title_highlight', None),
"content_highlight": getattr(entry, 'content_highlight', None),
"rank": getattr(entry, 'rank', None)
}
if include_content:
# Ensure markdown_content exists before accessing
result["markdown_content"] = getattr(entry, 'markdown_content', None)
else: # ORM object (from query)
result = {
"id": entry.id,
"url": entry.url,
"title": entry.title,
"visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
"domain": entry.domain,
}
if include_content:
result["markdown_content"] = entry.markdown_content
return result
def serialize_bookmark(bookmark):
"""Serialize a Bookmark object to a dictionary"""
return {
"id": bookmark.id,
"url": bookmark.url,
"title": bookmark.title,
"added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
"folder": bookmark.folder,
"domain": bookmark.domain,
}
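To make the serializer behaviour concrete, here is a small usage sketch for an ORM object; the app.* import paths and the field values are assumptions made only for the example.

# Illustrative use of serialize_history_entry on an ORM object (no ._mapping attribute).
from datetime import datetime, timezone

from app.database import HistoryEntry          # import path assumed from the diff layout
from app.utils import serialize_history_entry

entry = HistoryEntry(
    url="https://example.com/article",
    title="Example article",
    visit_time=datetime(2025, 4, 11, 22, 0, tzinfo=timezone.utc),
    domain="example.com",
)

print(serialize_history_entry(entry))
# {'id': None, 'url': 'https://example.com/article', 'title': 'Example article',
#  'visit_time': '2025-04-11T22:00:00+00:00', 'domain': 'example.com'}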

View File

@@ -4,13 +4,19 @@ excluded_domains:
  - localhost
  - 127.0.0.1
  # Specific Domains / Subdomains
  - ap.www.namecheap.com # Ignore this specific subdomain
  - www.namecheap.com # Ignore the main domain (will cover /twofa/* path implicitly)
  - login.linode.com # Ignore the login subdomain
  # IP ranges (requires wildcard matching in config.py)
  - 192.168.*.*
  - 10.*.*.*
  - 172.16.*.*
  - 0.0.0.* # Note: Be careful with overly broad patterns
  # Example wildcard patterns (requires wildcard matching in config.py)
  # - *.local
  # - reddit-*.com
  # - *.githubusercontent.com
  # - *.google.com # Example: Ignore all google subdomains
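Because several of these entries rely on wildcard patterns, a standalone sketch of fnmatch-style matching is shown below; the real check is Config.is_domain_ignored (see config.py), so this snippet is only illustrative.

# Minimal illustration of glob-style domain matching against patterns like those above.
import fnmatch

excluded = ["localhost", "www.namecheap.com", "192.168.*.*", "*.google.com"]

def is_ignored(domain: str) -> bool:
    domain = domain.lower()
    return any(fnmatch.fnmatch(domain, pattern.lower()) for pattern in excluded)

print(is_ignored("192.168.1.42"))     # True  -- matches 192.168.*.*
print(is_ignored("docs.google.com"))  # True  -- matches *.google.com
print(is_ignored("example.com"))      # False -- no pattern matches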

20
pyproject.toml Normal file
View File

@@ -0,0 +1,20 @@
[project]
name = "browser-recall"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10.16"
dependencies = [
"crawl4ai",
"fastapi",
"sqlalchemy",
"uvicorn",
"pytz",
"aiofiles",
"websockets",
"pyyaml",
"browser-history",
"pydantic",
"pydantic-settings",
"iso8601",
]
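With these dependencies in place, a local development entry point might look like the following; the app.main:app import string and the port are assumptions about the project layout rather than something this diff specifies.

# Hypothetical dev entry point; adjust the import string to the real module path.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app.main:app", host="127.0.0.1", port=8000, reload=True)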

2343
uv.lock generated Normal file

File diff suppressed because it is too large