from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from sqlalchemy import text
from sqlalchemy.orm import Session
from datetime import datetime, timezone, timedelta
from typing import List, Optional
from urllib.parse import urlparse
import asyncio
import pytz
import iso8601
import browser_history
from bs4 import BeautifulSoup

from .logging_config import setup_logger
from .database import (
    get_db,
    HistoryEntry,
    Bookmark,
    get_last_processed_timestamp,
    update_last_processed_timestamp,
    create_tables,
    engine,
    recreate_fts_tables
)
from .scheduler import HistoryScheduler
from .page_info import PageInfo
from .page_reader import PageReader
from .config import Config

logger = setup_logger(__name__)

app = FastAPI()
scheduler = HistoryScheduler()
config = Config()

# Add CORS middleware to allow WebSocket connections
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # In production, specify your domains
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

templates = Jinja2Templates(directory="app/templates")
app.mount("/static", StaticFiles(directory="app/static"), name="static")


@app.on_event("startup")
|
|
async def startup_event():
|
|
logger.info("Starting application")
|
|
|
|
try:
|
|
# First create the base tables
|
|
logger.info("Creating base tables...")
|
|
create_tables()
|
|
|
|
# # Drop and recreate FTS tables
|
|
# logger.info("Recreating FTS tables...")
|
|
# with engine.connect() as conn:
|
|
# # First check if the main history table exists
|
|
# result = conn.execute(text(
|
|
# "SELECT name FROM sqlite_master WHERE type='table' AND name='history'"
|
|
# )).fetchone()
|
|
|
|
# if not result:
|
|
# logger.info("Main history table doesn't exist yet, creating tables...")
|
|
# Base.metadata.create_all(bind=engine)
|
|
|
|
# # Now recreate FTS tables
|
|
# logger.info("Dropping and recreating FTS tables...")
|
|
# recreate_fts_tables()
|
|
|
|
# logger.info("FTS tables recreation completed")
|
|
|
|
# Initial history and bookmark fetch
|
|
logger.info("Processing initial browser history...")
|
|
process_browser_history()
|
|
|
|
logger.info("Updating bookmarks...")
|
|
await scheduler.update_bookmarks()
|
|
|
|
# Start the background tasks
|
|
logger.info("Starting background tasks...")
|
|
asyncio.create_task(scheduler.update_history())
|
|
|
|
logger.info("Startup completed successfully")
|
|
|
|
except Exception as e:
|
|
logger.error(f"Error during startup: {str(e)}", exc_info=True)
|
|
raise
|
|
|
|
def serialize_history_entry(entry, include_content: bool = False):
    """Serialize a HistoryEntry object to a dictionary"""
    # Handle both ORM objects and raw SQL results
    if hasattr(entry, '_mapping'):  # Raw SQL result
        result = {
            "id": entry.id,
            "url": entry.url,
            "title": entry.title,
            "visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
            "domain": entry.domain,
        }
    else:  # ORM object
        result = {
            "id": entry.id,
            "url": entry.url,
            "title": entry.title,
            "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
            "domain": entry.domain,
        }

    if include_content:
        result["markdown_content"] = entry.markdown_content
    return result


def serialize_bookmark(bookmark):
    """Serialize a Bookmark object to a dictionary"""
    return {
        "id": bookmark.id,
        "url": bookmark.url,
        "title": bookmark.title,
        "added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
        "folder": bookmark.folder,
        "domain": bookmark.domain,
    }


@app.get("/history/search")
|
|
async def search_history(
|
|
query: Optional[str] = Query(None),
|
|
domain: Optional[str] = Query(None),
|
|
start_date: Optional[str] = Query(None),
|
|
end_date: Optional[str] = Query(None),
|
|
include_content: bool = Query(False),
|
|
db: Session = Depends(get_db)
|
|
):
|
|
"""Search history using FTS5"""
|
|
try:
|
|
if query:
|
|
# Build the FTS query
|
|
fts_conditions = [f'title:{query}* OR markdown_content:{query}*']
|
|
params = {'query': query}
|
|
|
|
if domain:
|
|
fts_conditions.append(f'domain:"{domain}"')
|
|
|
|
fts_query = ' AND '.join(fts_conditions)
|
|
|
|
# Build the SQL query
|
|
sql = """
|
|
SELECT
|
|
h.*,
|
|
bm25(history_fts) as rank,
|
|
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
|
|
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
|
|
FROM history_fts
|
|
JOIN history h ON history_fts.rowid = h.id
|
|
WHERE history_fts MATCH :fts_query
|
|
"""
|
|
|
|
# Add date filters if provided
|
|
if start_date:
|
|
sql += " AND h.visit_time >= :start_date"
|
|
params['start_date'] = start_date
|
|
if end_date:
|
|
sql += " AND h.visit_time <= :end_date"
|
|
params['end_date'] = end_date
|
|
|
|
sql += " ORDER BY rank, h.visit_time DESC LIMIT 100"
|
|
|
|
params['fts_query'] = fts_query
|
|
|
|
results = db.execute(text(sql), params).fetchall()
|
|
return [serialize_history_entry(row, include_content) for row in results]
|
|
|
|
else:
|
|
# Handle non-search queries
|
|
query = db.query(HistoryEntry)
|
|
|
|
if domain:
|
|
query = query.filter(HistoryEntry.domain == domain)
|
|
if start_date:
|
|
query = query.filter(HistoryEntry.visit_time >= start_date)
|
|
if end_date:
|
|
query = query.filter(HistoryEntry.visit_time <= end_date)
|
|
|
|
entries = query.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
|
|
return [serialize_history_entry(entry, include_content) for entry in entries]
|
|
|
|
except Exception as e:
|
|
logger.error(f"Search error: {str(e)}", exc_info=True)
|
|
raise HTTPException(
|
|
status_code=500,
|
|
detail={"message": "Search operation failed", "error": str(e)}
|
|
)
|
|
|
|
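# Example request against the search endpoint above — a sketch assuming the API is
# served locally on port 8000 (host and port are not defined in this module):
#
#   GET http://localhost:8000/history/search?query=fastapi&domain=github.com&include_content=true
#
# `query` is matched against the `title` and `markdown_content` FTS columns with a
# prefix wildcard, while `domain`, `start_date`, and `end_date` narrow the results.

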
@app.get("/bookmarks/search")
|
|
async def search_bookmarks(
|
|
domain: Optional[str] = Query(None),
|
|
folder: Optional[str] = Query(None),
|
|
search_term: Optional[str] = Query(None),
|
|
db: Session = Depends(get_db)
|
|
):
|
|
"""Search bookmarks with optimized queries"""
|
|
try:
|
|
# Build query efficiently
|
|
query = db.query(Bookmark)
|
|
|
|
# Apply filters using index-optimized queries
|
|
if domain:
|
|
query = query.filter(Bookmark.domain == domain)
|
|
|
|
if folder:
|
|
query = query.filter(Bookmark.folder == folder)
|
|
|
|
if search_term:
|
|
# Use LIKE with index hint for title search
|
|
search_pattern = f"%{search_term}%"
|
|
query = query.filter(
|
|
Bookmark.title.ilike(search_pattern)
|
|
).with_hint(
|
|
Bookmark,
|
|
'INDEXED BY ix_bookmarks_title',
|
|
'sqlite'
|
|
)
|
|
|
|
# Add ordering and limit for better performance
|
|
bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
|
|
|
|
return [serialize_bookmark(bookmark) for bookmark in bookmarks]
|
|
|
|
except Exception as e:
|
|
print(f"Bookmark search error: {e}")
|
|
raise HTTPException(status_code=500, detail="Search operation failed")
|
|
|
|
# Add new endpoint for advanced full-text search
@app.get("/history/search/advanced")
async def advanced_history_search(
    query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
    include_content: bool = Query(False),
    db: Session = Depends(get_db)
):
    """Advanced full-text search using SQLite FTS5 features"""
    try:
        # Use raw SQL for advanced FTS query
        fts_query = """
            SELECT h.*, rank
            FROM history h
            INNER JOIN history_fts ON h.id = history_fts.rowid
            WHERE history_fts MATCH :query
            ORDER BY rank
            LIMIT 1000
        """

        results = db.execute(text(fts_query), {'query': query}).all()

        # Convert results to HistoryEntry objects
        entries = [
            serialize_history_entry(
                HistoryEntry(
                    id=row.id,
                    url=row.url,
                    title=row.title,
                    visit_time=row.visit_time,
                    domain=row.domain,
                    markdown_content=row.markdown_content if include_content else None
                ),
                include_content
            )
            for row in results
        ]

        return entries

    except Exception as e:
        logger.error(f"Advanced search error: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail="Advanced search operation failed")


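# Example FTS5 query strings for the advanced endpoint above — a sketch; the column
# names follow how history_fts is used elsewhere in this module (title,
# markdown_content, domain), and the query string must be URL-encoded by the client:
#
#   GET /history/search/advanced?query=python AND fastapi
#   GET /history/search/advanced?query="browser history" OR bookmarks
#   GET /history/search/advanced?query=title:recall* AND markdown_content:websocket

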
@app.websocket("/ws")
|
|
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
|
|
logger.info("New WebSocket connection established")
|
|
page_reader = PageReader()
|
|
await websocket.accept()
|
|
try:
|
|
while True:
|
|
data = await websocket.receive_json()
|
|
|
|
# Parse the URL and check if domain should be ignored
|
|
domain = urlparse(data['url']).netloc
|
|
if config.is_domain_ignored(domain):
|
|
logger.info(f"Ignoring domain: {domain}")
|
|
await websocket.send_json({
|
|
"status": "ignored",
|
|
"message": f"Domain {domain} is in ignore list"
|
|
})
|
|
continue
|
|
|
|
logger.info(f"Processing page: {data['url']}")
|
|
timestamp = iso8601.parse_date(data['timestamp'])
|
|
|
|
# Check if we already have a recent entry for this URL
|
|
existing_entry = db.query(HistoryEntry).filter(
|
|
HistoryEntry.url == data['url'],
|
|
HistoryEntry.visit_time >= timestamp - timedelta(minutes=5)
|
|
).first()
|
|
|
|
if existing_entry:
|
|
print(f"Recent entry exists for URL: {data['url']}")
|
|
await websocket.send_json({
|
|
"status": "skipped",
|
|
"message": "Recent entry exists"
|
|
})
|
|
continue
|
|
|
|
page_info = PageInfo(
|
|
url=data['url'],
|
|
html=data['html'],
|
|
timestamp=timestamp
|
|
)
|
|
|
|
# Debug HTML content
|
|
print(f"HTML content length before processing: {len(page_info.html)}")
|
|
|
|
# Extract title
|
|
soup = BeautifulSoup(page_info.html, 'html.parser')
|
|
title = soup.title.string if soup.title else ''
|
|
print(f"Extracted title: {title}")
|
|
|
|
# Debug markdown conversion
|
|
print("Starting markdown conversion...")
|
|
cleaned_html = page_reader.clean_html(page_info.html)
|
|
print(f"Cleaned HTML length: {len(cleaned_html)}")
|
|
|
|
markdown_content = page_reader.html_to_markdown(page_info.html)
|
|
print(f"Markdown conversion complete. Content length: {len(markdown_content) if markdown_content else 0}")
|
|
|
|
if markdown_content:
|
|
print("First 100 chars of markdown:", markdown_content[:100])
|
|
else:
|
|
print("No markdown content generated")
|
|
|
|
if not title and not markdown_content:
|
|
print(f"No content extracted from: {page_info.url}")
|
|
await websocket.send_json({
|
|
"status": "skipped",
|
|
"message": "No content extracted"
|
|
})
|
|
continue
|
|
|
|
# Create history entry
|
|
history_entry = HistoryEntry(
|
|
url=page_info.url,
|
|
title=title,
|
|
visit_time=page_info.timestamp,
|
|
domain=domain,
|
|
markdown_content=markdown_content,
|
|
last_content_update=datetime.now(timezone.utc)
|
|
)
|
|
|
|
# Debug database operation
|
|
print(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}")
|
|
|
|
# Use bulk operations for better performance
|
|
db.add(history_entry)
|
|
|
|
try:
|
|
db.commit()
|
|
print(f"Successfully saved entry for: {page_info.url}")
|
|
print(f"Verify markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}")
|
|
await websocket.send_json({
|
|
"status": "success",
|
|
"message": f"Processed page: {page_info.url}"
|
|
})
|
|
except Exception as e:
|
|
db.rollback()
|
|
print(f"Error saving entry: {e}")
|
|
await websocket.send_json({
|
|
"status": "error",
|
|
"message": "Database error"
|
|
})
|
|
|
|
except WebSocketDisconnect:
|
|
logger.info("Client disconnected")
|
|
except Exception as e:
|
|
logger.error("Error in WebSocket handler", exc_info=True)
|
|
finally:
|
|
await page_reader.close()
|
|
|
|
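# Example of the JSON message the /ws endpoint above expects — a sketch inferred from
# the handler's use of data['url'], data['html'], and data['timestamp'] (ISO 8601):
#
#   {
#     "url": "https://example.com/article",
#     "html": "<html>...</html>",
#     "timestamp": "2024-01-01T12:00:00+00:00"
#   }
#
# The server replies with {"status": ..., "message": ...}, where status is one of
# "ignored", "skipped", "success", or "error".

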
@app.get("/config/ignored-domains")
|
|
async def get_ignored_domains():
|
|
"""Get list of ignored domain patterns"""
|
|
return {"ignored_domains": config.config.get('ignored_domains', [])}
|
|
|
|
@app.post("/config/ignored-domains")
|
|
async def add_ignored_domain(pattern: str):
|
|
"""Add a new domain pattern to ignored list"""
|
|
config.add_ignored_domain(pattern)
|
|
return {"status": "success", "message": f"Added pattern: {pattern}"}
|
|
|
|
@app.delete("/config/ignored-domains/{pattern}")
|
|
async def remove_ignored_domain(pattern: str):
|
|
"""Remove a domain pattern from ignored list"""
|
|
config.remove_ignored_domain(pattern)
|
|
return {"status": "success", "message": f"Removed pattern: {pattern}"}
|
|
|
|
@app.get("/")
|
|
async def home(request: Request, db: Session = Depends(get_db)):
|
|
# Get recent history entries
|
|
entries = db.query(HistoryEntry)\
|
|
.order_by(HistoryEntry.visit_time.desc())\
|
|
.limit(50)\
|
|
.all()
|
|
return templates.TemplateResponse(
|
|
"index.html",
|
|
{"request": request, "entries": entries}
|
|
)
|
|
|
|
@app.get("/search")
|
|
async def search_page(request: Request):
|
|
return templates.TemplateResponse(
|
|
"search.html",
|
|
{"request": request}
|
|
)
|
|
|
|
@app.get("/bookmarks")
|
|
async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
|
|
bookmarks = db.query(Bookmark)\
|
|
.order_by(Bookmark.added_time.desc())\
|
|
.limit(50)\
|
|
.all()
|
|
return templates.TemplateResponse(
|
|
"bookmarks.html",
|
|
{"request": request, "bookmarks": bookmarks}
|
|
)
|
|
|
|
def process_browser_history():
    try:
        logger.info("Starting browser history processing")
        outputs = browser_history.get_history()
        history_list = outputs.histories  # This is a list of tuples (timestamp, url, title)
        logger.info(f"Found {len(history_list)} total history items")

        current_timestamp = int(datetime.now().timestamp())
        source_key = "browser_history"  # Single source since we get combined history
        last_timestamp = get_last_processed_timestamp(source_key)

        logger.info(f"Last processed timestamp: {last_timestamp}")

        # Filter for only new entries
        new_entries = [
            entry for entry in history_list
            if entry[0].timestamp() > last_timestamp
        ]

        logger.info(f"Found {len(new_entries)} new entries")

        if new_entries:
            for timestamp, url, title in new_entries:
                logger.info(f"Processing entry: {timestamp} - {url}")
                domain = urlparse(url).netloc
                if config.is_domain_ignored(domain):
                    logger.debug(f"Skipping ignored domain: {domain}")
                    continue

                # Create history entry
                db = next(get_db())
                try:
                    history_entry = HistoryEntry(
                        url=url,
                        title=title,
                        visit_time=timestamp,
                        domain=domain
                    )
                    db.add(history_entry)
                    db.commit()
                except Exception as e:
                    logger.error(f"Error storing history item: {str(e)}")
                    db.rollback()
                finally:
                    db.close()

            # Update the last processed timestamp
            update_last_processed_timestamp(source_key, current_timestamp)
            logger.info(f"Updated timestamp to {current_timestamp}")

        logger.info(f"Processed {len(new_entries)} new items")

    except Exception as e:
        logger.error(f"Error processing browser history: {str(e)}", exc_info=True)