From 80516440d7ec3d37f12fbb36b199af67b401c561 Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Wed, 29 Jan 2025 20:28:32 -0600
Subject: [PATCH] Update FTS query

---
 app/database.py | 102 ++++++++++++++++++++++++++++++++----
 app/main.py     | 135 +++++++++++++++++++++++++-----------------------
 2 files changed, 162 insertions(+), 75 deletions(-)

diff --git a/app/database.py b/app/database.py
index 2ed03a0..a77994a 100644
--- a/app/database.py
+++ b/app/database.py
@@ -95,38 +95,40 @@ def init_fts():
     conn = engine.raw_connection()
     cursor = conn.cursor()
 
-    # Update FTS table configuration to use trigrams for better partial matching
+    # Create FTS table with content and title columns
     cursor.execute("""
         CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5(
             title,
             markdown_content,
+            domain,                -- Add domain for filtering
+            visit_time UNINDEXED,  -- Add visit_time but don't index it
             content='history',
             content_rowid='id',
             tokenize='trigram'
         )
     """)
 
-    # Create triggers to keep FTS index up to date
+    # Update triggers to include domain and visit_time
    cursor.execute("""
        CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN
-            INSERT INTO history_fts(rowid, title, markdown_content)
-            VALUES (new.id, new.title, new.markdown_content);
+            INSERT INTO history_fts(rowid, title, markdown_content, domain, visit_time)
+            VALUES (new.id, new.title, new.markdown_content, new.domain, new.visit_time);
         END;
     """)
 
     cursor.execute("""
         CREATE TRIGGER IF NOT EXISTS history_ad AFTER DELETE ON history BEGIN
-            INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
-            VALUES('delete', old.id, old.title, old.markdown_content);
+            INSERT INTO history_fts(history_fts, rowid, title, markdown_content, domain, visit_time)
+            VALUES('delete', old.id, old.title, old.markdown_content, old.domain, old.visit_time);
         END;
     """)
 
     cursor.execute("""
         CREATE TRIGGER IF NOT EXISTS history_au AFTER UPDATE ON history BEGIN
-            INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
-            VALUES('delete', old.id, old.title, old.markdown_content);
-            INSERT INTO history_fts(rowid, title, markdown_content)
-            VALUES (new.id, new.title, new.markdown_content);
+            INSERT INTO history_fts(history_fts, rowid, title, markdown_content, domain, visit_time)
+            VALUES('delete', old.id, old.title, old.markdown_content, old.domain, old.visit_time);
+            INSERT INTO history_fts(rowid, title, markdown_content, domain, visit_time)
+            VALUES (new.id, new.title, new.markdown_content, new.domain, new.visit_time);
         END;
     """)
@@ -198,4 +200,82 @@ def create_tables():
         )
         db.commit()
     finally:
-        db.close()
\ No newline at end of file
+        db.close()
+
+def search_history(query, domain=None, start_date=None, end_date=None, db=None):
+    """
+    Search history using FTS5 with proper ranking
+    """
+    if db is None:
+        db = next(get_db())
+
+    try:
+        # Build the FTS query
+        fts_query = f'"{query}"'  # Exact phrase
+        if domain:
+            fts_query += f' AND domain:"{domain}"'
+
+        # Build date filter conditions; qualify visit_time with h. since the
+        # FTS table also carries a visit_time column
+        date_conditions = []
+        params = {'fts_query': fts_query}
+
+        if start_date:
+            date_conditions.append("h.visit_time >= :start_date")
+            params['start_date'] = start_date
+        if end_date:
+            date_conditions.append("h.visit_time <= :end_date")
+            params['end_date'] = end_date
+
+        date_filter = f"AND {' AND '.join(date_conditions)}" if date_conditions else ""
+
+        # Execute the search query
+        sql_query = f"""
+            SELECT
+                h.*,
+                bm25(history_fts) as rank,
+                highlight(history_fts, 0, '', '') as title_highlight,
+                highlight(history_fts, 1, '', '') as content_highlight
+            FROM history_fts
+            JOIN history h ON history_fts.rowid = h.id
+            WHERE history_fts MATCH :fts_query
+            {date_filter}
+            ORDER BY rank, h.visit_time DESC
+            LIMIT 100
+        """
+
+        results = db.execute(text(sql_query), params).fetchall()
+        return results
+
+    except Exception as e:
+        print(f"Search error: {e}")
+        return []
+
+def recreate_fts_tables():
+    """Drop and recreate the FTS tables"""
+    conn = engine.raw_connection()
+    cursor = conn.cursor()
+    try:
+        # Drop existing FTS table and triggers
+        cursor.execute("DROP TRIGGER IF EXISTS history_ai")
+        cursor.execute("DROP TRIGGER IF EXISTS history_ad")
+        cursor.execute("DROP TRIGGER IF EXISTS history_au")
+        cursor.execute("DROP TABLE IF EXISTS history_fts")
+
+        # Recreate FTS tables and triggers
+        init_fts()
+
+        # Reindex all existing content
+        cursor.execute("""
+            INSERT INTO history_fts(rowid, title, markdown_content, domain, visit_time)
+            SELECT id, title, markdown_content, domain, visit_time FROM history
+        """)
+
+        conn.commit()
+        print("Successfully recreated FTS tables and reindexed content")
+
+    except Exception as e:
+        conn.rollback()
+        print(f"Error recreating FTS tables: {e}")
+    finally:
+        cursor.close()
+        conn.close()
\ No newline at end of file
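Note (not part of the patch): a minimal sketch of how the new helpers above might be exercised, assuming the module layout in this diff; the search terms and dates are hypothetical.

    from app.database import search_history, recreate_fts_tables

    # After changing the FTS schema, existing rows must be reindexed once,
    # e.g. via the helper above. For an external-content table, FTS5's
    # built-in  INSERT INTO history_fts(history_fts) VALUES('rebuild')
    # would be an equivalent one-statement alternative.
    recreate_fts_tables()

    # Phrase search, optionally narrowed by domain and visit-time window
    rows = search_history("sqlite fts5", domain="sqlite.org",
                          start_date="2025-01-01", end_date="2025-01-31")
    for row in rows:
        print(row.rank, row.title_highlight)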
diff --git a/app/main.py b/app/main.py
index 1b403e8..debcf91 100644
--- a/app/main.py
+++ b/app/main.py
@@ -23,12 +23,14 @@ from .database import (
     get_last_processed_timestamp,
     update_last_processed_timestamp,
     create_tables,
-    engine
+    engine,
+    recreate_fts_tables
 )
 from .scheduler import HistoryScheduler
 from .page_info import PageInfo
 from .page_reader import PageReader
 from .config import Config
+from sqlalchemy.ext.declarative import declarative_base
 
 logger = setup_logger(__name__)
@@ -52,21 +54,45 @@ app.mount("/static", StaticFiles(directory="app/static"), name="static")
 async def startup_event():
     logger.info("Starting application")
 
-    # Create necessary tables
-    create_tables()
-
-    # Initial history and bookmark fetch
     try:
-        # Process history
+        # First create the base tables
+        logger.info("Creating base tables...")
+        create_tables()
+
+        # # Drop and recreate FTS tables
+        # logger.info("Recreating FTS tables...")
+        # with engine.connect() as conn:
+        #     # First check if the main history table exists
+        #     result = conn.execute(text(
+        #         "SELECT name FROM sqlite_master WHERE type='table' AND name='history'"
+        #     )).fetchone()
+
+        #     if not result:
+        #         logger.info("Main history table doesn't exist yet, creating tables...")
+        #         Base.metadata.create_all(bind=engine)
+
+        #     # Now recreate FTS tables
+        #     logger.info("Dropping and recreating FTS tables...")
+        #     recreate_fts_tables()
+
+        #     logger.info("FTS tables recreation completed")
+
+        # Initial history and bookmark fetch
+        logger.info("Processing initial browser history...")
         process_browser_history()
 
-        # Process bookmarks
+        logger.info("Updating bookmarks...")
         await scheduler.update_bookmarks()
 
         # Start the background tasks
+        logger.info("Starting background tasks...")
         asyncio.create_task(scheduler.update_history())
+
+        logger.info("Startup completed successfully")
+
     except Exception as e:
-        logger.error(f"Error during startup: {str(e)}")
+        logger.error(f"Error during startup: {str(e)}", exc_info=True)
+        raise
 
 def serialize_history_entry(entry, include_content: bool = False):
     """Serialize a HistoryEntry object to a dictionary"""
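Note (not part of the patch): the endpoint below builds its FTS5 MATCH string by ANDing conditions together, and FTS5 gives AND higher precedence than OR, so the title/content alternatives need explicit parentheses. A standalone check with hypothetical data, assuming the interpreter's bundled SQLite is 3.34+ (trigram tokenizer):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute(
        "CREATE VIRTUAL TABLE t USING fts5(title, body, domain, tokenize='trigram')"
    )
    conn.execute("INSERT INTO t VALUES ('python tips', 'some notes', 'example.com')")

    # Ungrouped, 'title:python OR body:python AND domain:"other.org"' parses as
    # title:python OR (body:python AND domain:"other.org"), so the row matches
    # even though its domain is wrong. Grouped, the domain filter applies to
    # both alternatives:
    q = '(title:python OR body:python) AND domain:"example.com"'
    print(conn.execute("SELECT title FROM t WHERE t MATCH ?", (q,)).fetchall())
    # -> [('python tips',)]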
@@ -105,71 +131,54 @@ def serialize_bookmark(bookmark):
 @app.get("/history/search")
 async def search_history(
+    query: Optional[str] = Query(None),
     domain: Optional[str] = Query(None),
     start_date: Optional[str] = Query(None),
     end_date: Optional[str] = Query(None),
-    search_term: Optional[str] = Query(None),
     include_content: bool = Query(False),
     db: Session = Depends(get_db)
 ):
-    """Search history with optimized full-text search"""
+    """Search history using FTS5"""
     try:
-        if search_term:
-            # Modified query to handle title-only searches better
-            fts_query = """
-                WITH RECURSIVE
-                ranked_results AS (
-                    SELECT DISTINCT h.*,
-                        CASE
-                            -- Boost exact title matches highest
-                            WHEN h.title LIKE :exact_pattern THEN 4.0
-                            -- Boost title prefix matches
-                            WHEN h.title LIKE :prefix_pattern THEN 3.0
-                            -- Boost title contains matches
-                            WHEN h.title LIKE :like_pattern THEN 2.0
-                            -- Lower boost for content matches
-                            WHEN h.markdown_content IS NOT NULL AND (
-                                h.markdown_content LIKE :exact_pattern OR
-                                h.markdown_content LIKE :prefix_pattern OR
-                                h.markdown_content LIKE :like_pattern
-                            ) THEN 1.0
-                            ELSE 0.5
-                        END * (
-                            CAST(strftime('%s', h.visit_time) AS INTEGER) /
-                            CAST(strftime('%s', 'now') AS INTEGER) * 0.5 + 1
-                        ) as final_rank
-                    FROM history h
-                    LEFT JOIN history_fts f ON h.id = f.rowid
-                    WHERE
-                        h.title LIKE :like_pattern
-                        OR (h.markdown_content IS NOT NULL AND history_fts MATCH :search)
-                        AND (:domain IS NULL OR h.domain = :domain)
-                        AND (:start_date IS NULL OR h.visit_time >= :start_date)
-                        AND (:end_date IS NULL OR h.visit_time <= :end_date)
-                )
-                SELECT * FROM ranked_results
-                WHERE final_rank > 0
-                ORDER BY final_rank DESC
-                LIMIT 100
-            """
+        if query:
+            # Build the FTS query; parenthesize the OR so the domain filter
+            # (ANDed below) applies to both the title and content alternatives
+            fts_conditions = [f'(title:{query}* OR markdown_content:{query}*)']
+            params = {}
+
+            if domain:
+                fts_conditions.append(f'domain:"{domain}"')
+
+            fts_query = ' AND '.join(fts_conditions)
+
+            # Build the SQL query
+            sql = """
+                SELECT
+                    h.*,
+                    bm25(history_fts) as rank,
+                    highlight(history_fts, 0, '', '') as title_highlight,
+                    highlight(history_fts, 1, '', '') as content_highlight
+                FROM history_fts
+                JOIN history h ON history_fts.rowid = h.id
+                WHERE history_fts MATCH :fts_query
+            """
-            # Prepare search patterns for different matching strategies
-            params = {
-                'search': f'{search_term}*',          # Wildcard suffix matching
-                'like_pattern': f'%{search_term}%',   # Contains matching
-                'exact_pattern': search_term,         # Exact matching
-                'prefix_pattern': f'{search_term}%',  # Prefix matching
-                'domain': domain,
-                'start_date': start_date,
-                'end_date': end_date
-            }
+            # Add date filters if provided
+            if start_date:
+                sql += " AND h.visit_time >= :start_date"
+                params['start_date'] = start_date
+            if end_date:
+                sql += " AND h.visit_time <= :end_date"
+                params['end_date'] = end_date
+
+            sql += " ORDER BY rank, h.visit_time DESC LIMIT 100"
+
+            params['fts_query'] = fts_query
+
+            results = db.execute(text(sql), params).fetchall()
+            return [serialize_history_entry(row, include_content) for row in results]
 
-            # Execute with connection context manager
-            with engine.connect() as connection:
-                results = connection.execute(text(fts_query), params).all()
-                return [serialize_history_entry(row, include_content) for row in results]
         else:
-            # Optimize non-FTS query
+            # Handle non-search queries
             query = db.query(HistoryEntry)
 
             if domain:
@@ -179,8 +188,6 @@ async def search_history(
             if end_date:
                 query = query.filter(HistoryEntry.visit_time <= end_date)
 
-            # Add index hints and limit
-            query = query.with_hint(HistoryEntry, 'INDEXED BY ix_history_visit_time', 'sqlite')
             entries = query.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
             return [serialize_history_entry(entry, include_content) for entry in entries]
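Note (not part of the patch): with the reworked endpoint, a client request might look like the following (host and port assumed); the former search_term parameter is now query, so existing callers need updating.

    curl 'http://localhost:8000/history/search?query=python&domain=docs.python.org&include_content=false'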