Update FTS query

This commit is contained in:
2025-01-29 20:28:32 -06:00
parent 20cfdf75e4
commit 80516440d7
2 changed files with 162 additions and 75 deletions

View File

@@ -95,38 +95,40 @@ def init_fts():
conn = engine.raw_connection() conn = engine.raw_connection()
cursor = conn.cursor() cursor = conn.cursor()
# Update FTS table configuration to use trigrams for better partial matching # Create FTS table with content and title columns
cursor.execute(""" cursor.execute("""
CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5( CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5(
title, title,
markdown_content, markdown_content,
domain, -- Add domain for filtering
visit_time UNINDEXED, -- Add visit_time but don't index it
content='history', content='history',
content_rowid='id', content_rowid='id',
tokenize='trigram' tokenize='trigram'
) )
""") """)
# Create triggers to keep FTS index up to date # Update triggers to include domain and visit_time
cursor.execute(""" cursor.execute("""
CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN
INSERT INTO history_fts(rowid, title, markdown_content) INSERT INTO history_fts(rowid, title, markdown_content, domain, visit_time)
VALUES (new.id, new.title, new.markdown_content); VALUES (new.id, new.title, new.markdown_content, new.domain, new.visit_time);
END; END;
""") """)
cursor.execute(""" cursor.execute("""
CREATE TRIGGER IF NOT EXISTS history_ad AFTER DELETE ON history BEGIN CREATE TRIGGER IF NOT EXISTS history_ad AFTER DELETE ON history BEGIN
INSERT INTO history_fts(history_fts, rowid, title, markdown_content) INSERT INTO history_fts(history_fts, rowid, title, markdown_content, domain, visit_time)
VALUES('delete', old.id, old.title, old.markdown_content); VALUES('delete', old.id, old.title, old.markdown_content, old.domain, old.visit_time);
END; END;
""") """)
cursor.execute(""" cursor.execute("""
CREATE TRIGGER IF NOT EXISTS history_au AFTER UPDATE ON history BEGIN CREATE TRIGGER IF NOT EXISTS history_au AFTER UPDATE ON history BEGIN
INSERT INTO history_fts(history_fts, rowid, title, markdown_content) INSERT INTO history_fts(history_fts, rowid, title, markdown_content, domain, visit_time)
VALUES('delete', old.id, old.title, old.markdown_content); VALUES('delete', old.id, old.title, old.markdown_content, old.domain, old.visit_time);
INSERT INTO history_fts(rowid, title, markdown_content) INSERT INTO history_fts(rowid, title, markdown_content, domain, visit_time)
VALUES (new.id, new.title, new.markdown_content); VALUES (new.id, new.title, new.markdown_content, new.domain, new.visit_time);
END; END;
""") """)
@@ -198,4 +200,82 @@ def create_tables():
) )
db.commit() db.commit()
finally: finally:
db.close() db.close()
def search_history(query, domain=None, start_date=None, end_date=None, db=None):
    """
    Search history using FTS5 with bm25 ranking.

    Args:
        query: Text to search for; matched as an exact phrase.
        domain: Optional domain to restrict matches to (uses the indexed
            ``domain`` column of the FTS table).
        start_date: Optional inclusive lower bound on ``visit_time``.
        end_date: Optional inclusive upper bound on ``visit_time``.
        db: Optional SQLAlchemy session; when omitted a fresh one is
            pulled from ``get_db()``.

    Returns:
        Up to 100 joined history rows (with ``rank``, ``title_highlight``
        and ``content_highlight`` columns) ordered by relevance then
        recency, or an empty list on error.
    """
    if db is None:
        db = next(get_db())
    try:
        # Compose the FTS MATCH expression: exact phrase, optionally
        # AND-ed with a domain restriction.
        fts_query = f'"{query}"'
        if domain:
            fts_query += f' AND domain:"{domain}"'

        # BUG FIX: bind the composed fts_query as the MATCH parameter.
        # Previously the raw query string was bound instead, so the
        # domain filter built above was silently ignored.
        params = {'query': fts_query}

        # Optional inclusive date-range filters on the history table.
        date_conditions = []
        if start_date:
            date_conditions.append("h.visit_time >= :start_date")
            params['start_date'] = start_date
        if end_date:
            date_conditions.append("h.visit_time <= :end_date")
            params['end_date'] = end_date
        date_filter = f"AND {' AND '.join(date_conditions)}" if date_conditions else ""

        # bm25() is ascending-is-better, so ORDER BY rank puts the most
        # relevant rows first. visit_time is qualified with h. to avoid
        # ambiguity with the UNINDEXED visit_time column on history_fts.
        sql_query = f"""
            SELECT
                h.*,
                bm25(history_fts) as rank,
                highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
                highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
            FROM history_fts
            JOIN history h ON history_fts.rowid = h.id
            WHERE history_fts MATCH :query
            {date_filter}
            ORDER BY rank, h.visit_time DESC
            LIMIT 100
        """
        results = db.execute(text(sql_query), params).fetchall()
        return results
    except Exception as e:
        # Best-effort: report and return no results rather than
        # propagating search failures to the caller.
        print(f"Search error: {e}")
        return []
def recreate_fts_tables():
    """Tear down the FTS5 index and its triggers, rebuild them, and
    reindex all existing history rows."""
    connection = engine.raw_connection()
    cur = connection.cursor()
    try:
        # Drop the sync triggers first, then the virtual table itself.
        for statement in (
            "DROP TRIGGER IF EXISTS history_ai",
            "DROP TRIGGER IF EXISTS history_ad",
            "DROP TRIGGER IF EXISTS history_au",
            "DROP TABLE IF EXISTS history_fts",
        ):
            cur.execute(statement)

        # Recreate the FTS table and triggers from scratch.
        init_fts()

        # Backfill the fresh index from every existing history row.
        cur.execute("""
            INSERT INTO history_fts(rowid, title, markdown_content, domain, visit_time)
            SELECT id, title, markdown_content, domain, visit_time FROM history
        """)
        connection.commit()
        print("Successfully recreated FTS tables and reindexed content")
    except Exception as e:
        # Best-effort: undo partial work and report instead of raising.
        connection.rollback()
        print(f"Error recreating FTS tables: {e}")
    finally:
        cur.close()
        connection.close()

View File

@@ -23,12 +23,14 @@ from .database import (
get_last_processed_timestamp, get_last_processed_timestamp,
update_last_processed_timestamp, update_last_processed_timestamp,
create_tables, create_tables,
engine engine,
recreate_fts_tables
) )
from .scheduler import HistoryScheduler from .scheduler import HistoryScheduler
from .page_info import PageInfo from .page_info import PageInfo
from .page_reader import PageReader from .page_reader import PageReader
from .config import Config from .config import Config
from sqlalchemy.ext.declarative import declarative_base
logger = setup_logger(__name__) logger = setup_logger(__name__)
@@ -52,21 +54,45 @@ app.mount("/static", StaticFiles(directory="app/static"), name="static")
async def startup_event(): async def startup_event():
logger.info("Starting application") logger.info("Starting application")
# Create necessary tables
create_tables()
# Initial history and bookmark fetch
try: try:
# Process history # First create the base tables
logger.info("Creating base tables...")
create_tables()
# # Drop and recreate FTS tables
# logger.info("Recreating FTS tables...")
# with engine.connect() as conn:
# # First check if the main history table exists
# result = conn.execute(text(
# "SELECT name FROM sqlite_master WHERE type='table' AND name='history'"
# )).fetchone()
# if not result:
# logger.info("Main history table doesn't exist yet, creating tables...")
# Base.metadata.create_all(bind=engine)
# # Now recreate FTS tables
# logger.info("Dropping and recreating FTS tables...")
# recreate_fts_tables()
# logger.info("FTS tables recreation completed")
# Initial history and bookmark fetch
logger.info("Processing initial browser history...")
process_browser_history() process_browser_history()
# Process bookmarks logger.info("Updating bookmarks...")
await scheduler.update_bookmarks() await scheduler.update_bookmarks()
# Start the background tasks # Start the background tasks
logger.info("Starting background tasks...")
asyncio.create_task(scheduler.update_history()) asyncio.create_task(scheduler.update_history())
logger.info("Startup completed successfully")
except Exception as e: except Exception as e:
logger.error(f"Error during startup: {str(e)}") logger.error(f"Error during startup: {str(e)}", exc_info=True)
raise
def serialize_history_entry(entry, include_content: bool = False): def serialize_history_entry(entry, include_content: bool = False):
"""Serialize a HistoryEntry object to a dictionary""" """Serialize a HistoryEntry object to a dictionary"""
@@ -105,71 +131,54 @@ def serialize_bookmark(bookmark):
@app.get("/history/search") @app.get("/history/search")
async def search_history( async def search_history(
query: Optional[str] = Query(None),
domain: Optional[str] = Query(None), domain: Optional[str] = Query(None),
start_date: Optional[str] = Query(None), start_date: Optional[str] = Query(None),
end_date: Optional[str] = Query(None), end_date: Optional[str] = Query(None),
search_term: Optional[str] = Query(None),
include_content: bool = Query(False), include_content: bool = Query(False),
db: Session = Depends(get_db) db: Session = Depends(get_db)
): ):
"""Search history with optimized full-text search""" """Search history using FTS5"""
try: try:
if search_term: if query:
# Modified query to handle title-only searches better # Build the FTS query
fts_query = """ fts_conditions = [f'title:{query}* OR markdown_content:{query}*']
WITH RECURSIVE params = {'query': query}
ranked_results AS (
SELECT DISTINCT h.*, if domain:
CASE fts_conditions.append(f'domain:"{domain}"')
-- Boost exact title matches highest
WHEN h.title LIKE :exact_pattern THEN 4.0 fts_query = ' AND '.join(fts_conditions)
-- Boost title prefix matches
WHEN h.title LIKE :prefix_pattern THEN 3.0 # Build the SQL query
-- Boost title contains matches sql = """
WHEN h.title LIKE :like_pattern THEN 2.0 SELECT
-- Lower boost for content matches h.*,
WHEN h.markdown_content IS NOT NULL AND ( bm25(history_fts) as rank,
h.markdown_content LIKE :exact_pattern OR highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
h.markdown_content LIKE :prefix_pattern OR highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
h.markdown_content LIKE :like_pattern FROM history_fts
) THEN 1.0 JOIN history h ON history_fts.rowid = h.id
ELSE 0.5 WHERE history_fts MATCH :fts_query
END * (
CAST(strftime('%s', h.visit_time) AS INTEGER) /
CAST(strftime('%s', 'now') AS INTEGER) * 0.5 + 1
) as final_rank
FROM history h
LEFT JOIN history_fts f ON h.id = f.rowid
WHERE
h.title LIKE :like_pattern
OR (h.markdown_content IS NOT NULL AND history_fts MATCH :search)
AND (:domain IS NULL OR h.domain = :domain)
AND (:start_date IS NULL OR h.visit_time >= :start_date)
AND (:end_date IS NULL OR h.visit_time <= :end_date)
)
SELECT * FROM ranked_results
WHERE final_rank > 0
ORDER BY final_rank DESC
LIMIT 100
""" """
# Prepare search patterns for different matching strategies # Add date filters if provided
params = { if start_date:
'search': f'{search_term}*', # Wildcard suffix matching sql += " AND h.visit_time >= :start_date"
'like_pattern': f'%{search_term}%', # Contains matching params['start_date'] = start_date
'exact_pattern': search_term, # Exact matching if end_date:
'prefix_pattern': f'{search_term}%', # Prefix matching sql += " AND h.visit_time <= :end_date"
'domain': domain, params['end_date'] = end_date
'start_date': start_date,
'end_date': end_date sql += " ORDER BY rank, h.visit_time DESC LIMIT 100"
}
params['fts_query'] = fts_query
results = db.execute(text(sql), params).fetchall()
return [serialize_history_entry(row, include_content) for row in results]
# Execute with connection context manager
with engine.connect() as connection:
results = connection.execute(text(fts_query), params).all()
return [serialize_history_entry(row, include_content) for row in results]
else: else:
# Optimize non-FTS query # Handle non-search queries
query = db.query(HistoryEntry) query = db.query(HistoryEntry)
if domain: if domain:
@@ -179,8 +188,6 @@ async def search_history(
if end_date: if end_date:
query = query.filter(HistoryEntry.visit_time <= end_date) query = query.filter(HistoryEntry.visit_time <= end_date)
# Add index hints and limit
query = query.with_hint(HistoryEntry, 'INDEXED BY ix_history_visit_time', 'sqlite')
entries = query.order_by(HistoryEntry.visit_time.desc()).limit(100).all() entries = query.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
return [serialize_history_entry(entry, include_content) for entry in entries] return [serialize_history_entry(entry, include_content) for entry in entries]