mirror of
https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 02:19:37 +00:00
Update FTS query
This commit is contained in:
102
app/database.py
102
app/database.py
@@ -95,38 +95,40 @@ def init_fts():
|
|||||||
conn = engine.raw_connection()
|
conn = engine.raw_connection()
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
# Update FTS table configuration to use trigrams for better partial matching
|
# Create FTS table with content and title columns
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5(
|
CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5(
|
||||||
title,
|
title,
|
||||||
markdown_content,
|
markdown_content,
|
||||||
|
domain, -- Add domain for filtering
|
||||||
|
visit_time UNINDEXED, -- Add visit_time but don't index it
|
||||||
content='history',
|
content='history',
|
||||||
content_rowid='id',
|
content_rowid='id',
|
||||||
tokenize='trigram'
|
tokenize='trigram'
|
||||||
)
|
)
|
||||||
""")
|
""")
|
||||||
|
|
||||||
# Create triggers to keep FTS index up to date
|
# Update triggers to include domain and visit_time
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN
|
CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN
|
||||||
INSERT INTO history_fts(rowid, title, markdown_content)
|
INSERT INTO history_fts(rowid, title, markdown_content, domain, visit_time)
|
||||||
VALUES (new.id, new.title, new.markdown_content);
|
VALUES (new.id, new.title, new.markdown_content, new.domain, new.visit_time);
|
||||||
END;
|
END;
|
||||||
""")
|
""")
|
||||||
|
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE TRIGGER IF NOT EXISTS history_ad AFTER DELETE ON history BEGIN
|
CREATE TRIGGER IF NOT EXISTS history_ad AFTER DELETE ON history BEGIN
|
||||||
INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
|
INSERT INTO history_fts(history_fts, rowid, title, markdown_content, domain, visit_time)
|
||||||
VALUES('delete', old.id, old.title, old.markdown_content);
|
VALUES('delete', old.id, old.title, old.markdown_content, old.domain, old.visit_time);
|
||||||
END;
|
END;
|
||||||
""")
|
""")
|
||||||
|
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE TRIGGER IF NOT EXISTS history_au AFTER UPDATE ON history BEGIN
|
CREATE TRIGGER IF NOT EXISTS history_au AFTER UPDATE ON history BEGIN
|
||||||
INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
|
INSERT INTO history_fts(history_fts, rowid, title, markdown_content, domain, visit_time)
|
||||||
VALUES('delete', old.id, old.title, old.markdown_content);
|
VALUES('delete', old.id, old.title, old.markdown_content, old.domain, old.visit_time);
|
||||||
INSERT INTO history_fts(rowid, title, markdown_content)
|
INSERT INTO history_fts(rowid, title, markdown_content, domain, visit_time)
|
||||||
VALUES (new.id, new.title, new.markdown_content);
|
VALUES (new.id, new.title, new.markdown_content, new.domain, new.visit_time);
|
||||||
END;
|
END;
|
||||||
""")
|
""")
|
||||||
|
|
||||||
@@ -198,4 +200,82 @@ def create_tables():
|
|||||||
)
|
)
|
||||||
db.commit()
|
db.commit()
|
||||||
finally:
|
finally:
|
||||||
db.close()
|
db.close()
|
||||||
|
|
||||||
|
def search_history(query, domain=None, start_date=None, end_date=None, db=None):
|
||||||
|
"""
|
||||||
|
Search history using FTS5 with proper ranking
|
||||||
|
"""
|
||||||
|
if db is None:
|
||||||
|
db = next(get_db())
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Build the FTS query
|
||||||
|
fts_query = f'"{query}"' # Exact phrase
|
||||||
|
if domain:
|
||||||
|
fts_query += f' AND domain:"{domain}"'
|
||||||
|
|
||||||
|
# Build date filter conditions
|
||||||
|
date_conditions = []
|
||||||
|
params = {'query': query}
|
||||||
|
|
||||||
|
if start_date:
|
||||||
|
date_conditions.append("visit_time >= :start_date")
|
||||||
|
params['start_date'] = start_date
|
||||||
|
if end_date:
|
||||||
|
date_conditions.append("visit_time <= :end_date")
|
||||||
|
params['end_date'] = end_date
|
||||||
|
|
||||||
|
date_filter = f"AND {' AND '.join(date_conditions)}" if date_conditions else ""
|
||||||
|
|
||||||
|
# Execute the search query
|
||||||
|
sql_query = f"""
|
||||||
|
SELECT
|
||||||
|
h.*,
|
||||||
|
bm25(history_fts) as rank,
|
||||||
|
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
|
||||||
|
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
|
||||||
|
FROM history_fts
|
||||||
|
JOIN history h ON history_fts.rowid = h.id
|
||||||
|
WHERE history_fts MATCH :query
|
||||||
|
{date_filter}
|
||||||
|
ORDER BY rank, visit_time DESC
|
||||||
|
LIMIT 100
|
||||||
|
"""
|
||||||
|
|
||||||
|
results = db.execute(text(sql_query), params).fetchall()
|
||||||
|
return results
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Search error: {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
def recreate_fts_tables():
|
||||||
|
"""Drop and recreate the FTS tables"""
|
||||||
|
conn = engine.raw_connection()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
try:
|
||||||
|
# Drop existing FTS table and triggers
|
||||||
|
cursor.execute("DROP TRIGGER IF EXISTS history_ai")
|
||||||
|
cursor.execute("DROP TRIGGER IF EXISTS history_ad")
|
||||||
|
cursor.execute("DROP TRIGGER IF EXISTS history_au")
|
||||||
|
cursor.execute("DROP TABLE IF EXISTS history_fts")
|
||||||
|
|
||||||
|
# Recreate FTS tables and triggers
|
||||||
|
init_fts()
|
||||||
|
|
||||||
|
# Reindex all existing content
|
||||||
|
cursor.execute("""
|
||||||
|
INSERT INTO history_fts(rowid, title, markdown_content, domain, visit_time)
|
||||||
|
SELECT id, title, markdown_content, domain, visit_time FROM history
|
||||||
|
""")
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
print("Successfully recreated FTS tables and reindexed content")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
conn.rollback()
|
||||||
|
print(f"Error recreating FTS tables: {e}")
|
||||||
|
finally:
|
||||||
|
cursor.close()
|
||||||
|
conn.close()
|
||||||
135
app/main.py
135
app/main.py
@@ -23,12 +23,14 @@ from .database import (
|
|||||||
get_last_processed_timestamp,
|
get_last_processed_timestamp,
|
||||||
update_last_processed_timestamp,
|
update_last_processed_timestamp,
|
||||||
create_tables,
|
create_tables,
|
||||||
engine
|
engine,
|
||||||
|
recreate_fts_tables
|
||||||
)
|
)
|
||||||
from .scheduler import HistoryScheduler
|
from .scheduler import HistoryScheduler
|
||||||
from .page_info import PageInfo
|
from .page_info import PageInfo
|
||||||
from .page_reader import PageReader
|
from .page_reader import PageReader
|
||||||
from .config import Config
|
from .config import Config
|
||||||
|
from sqlalchemy.ext.declarative import declarative_base
|
||||||
|
|
||||||
logger = setup_logger(__name__)
|
logger = setup_logger(__name__)
|
||||||
|
|
||||||
@@ -52,21 +54,45 @@ app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
|||||||
async def startup_event():
|
async def startup_event():
|
||||||
logger.info("Starting application")
|
logger.info("Starting application")
|
||||||
|
|
||||||
# Create necessary tables
|
|
||||||
create_tables()
|
|
||||||
|
|
||||||
# Initial history and bookmark fetch
|
|
||||||
try:
|
try:
|
||||||
# Process history
|
# First create the base tables
|
||||||
|
logger.info("Creating base tables...")
|
||||||
|
create_tables()
|
||||||
|
|
||||||
|
# # Drop and recreate FTS tables
|
||||||
|
# logger.info("Recreating FTS tables...")
|
||||||
|
# with engine.connect() as conn:
|
||||||
|
# # First check if the main history table exists
|
||||||
|
# result = conn.execute(text(
|
||||||
|
# "SELECT name FROM sqlite_master WHERE type='table' AND name='history'"
|
||||||
|
# )).fetchone()
|
||||||
|
|
||||||
|
# if not result:
|
||||||
|
# logger.info("Main history table doesn't exist yet, creating tables...")
|
||||||
|
# Base.metadata.create_all(bind=engine)
|
||||||
|
|
||||||
|
# # Now recreate FTS tables
|
||||||
|
# logger.info("Dropping and recreating FTS tables...")
|
||||||
|
# recreate_fts_tables()
|
||||||
|
|
||||||
|
# logger.info("FTS tables recreation completed")
|
||||||
|
|
||||||
|
# Initial history and bookmark fetch
|
||||||
|
logger.info("Processing initial browser history...")
|
||||||
process_browser_history()
|
process_browser_history()
|
||||||
|
|
||||||
# Process bookmarks
|
logger.info("Updating bookmarks...")
|
||||||
await scheduler.update_bookmarks()
|
await scheduler.update_bookmarks()
|
||||||
|
|
||||||
# Start the background tasks
|
# Start the background tasks
|
||||||
|
logger.info("Starting background tasks...")
|
||||||
asyncio.create_task(scheduler.update_history())
|
asyncio.create_task(scheduler.update_history())
|
||||||
|
|
||||||
|
logger.info("Startup completed successfully")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error during startup: {str(e)}")
|
logger.error(f"Error during startup: {str(e)}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
def serialize_history_entry(entry, include_content: bool = False):
|
def serialize_history_entry(entry, include_content: bool = False):
|
||||||
"""Serialize a HistoryEntry object to a dictionary"""
|
"""Serialize a HistoryEntry object to a dictionary"""
|
||||||
@@ -105,71 +131,54 @@ def serialize_bookmark(bookmark):
|
|||||||
|
|
||||||
@app.get("/history/search")
|
@app.get("/history/search")
|
||||||
async def search_history(
|
async def search_history(
|
||||||
|
query: Optional[str] = Query(None),
|
||||||
domain: Optional[str] = Query(None),
|
domain: Optional[str] = Query(None),
|
||||||
start_date: Optional[str] = Query(None),
|
start_date: Optional[str] = Query(None),
|
||||||
end_date: Optional[str] = Query(None),
|
end_date: Optional[str] = Query(None),
|
||||||
search_term: Optional[str] = Query(None),
|
|
||||||
include_content: bool = Query(False),
|
include_content: bool = Query(False),
|
||||||
db: Session = Depends(get_db)
|
db: Session = Depends(get_db)
|
||||||
):
|
):
|
||||||
"""Search history with optimized full-text search"""
|
"""Search history using FTS5"""
|
||||||
try:
|
try:
|
||||||
if search_term:
|
if query:
|
||||||
# Modified query to handle title-only searches better
|
# Build the FTS query
|
||||||
fts_query = """
|
fts_conditions = [f'title:{query}* OR markdown_content:{query}*']
|
||||||
WITH RECURSIVE
|
params = {'query': query}
|
||||||
ranked_results AS (
|
|
||||||
SELECT DISTINCT h.*,
|
if domain:
|
||||||
CASE
|
fts_conditions.append(f'domain:"{domain}"')
|
||||||
-- Boost exact title matches highest
|
|
||||||
WHEN h.title LIKE :exact_pattern THEN 4.0
|
fts_query = ' AND '.join(fts_conditions)
|
||||||
-- Boost title prefix matches
|
|
||||||
WHEN h.title LIKE :prefix_pattern THEN 3.0
|
# Build the SQL query
|
||||||
-- Boost title contains matches
|
sql = """
|
||||||
WHEN h.title LIKE :like_pattern THEN 2.0
|
SELECT
|
||||||
-- Lower boost for content matches
|
h.*,
|
||||||
WHEN h.markdown_content IS NOT NULL AND (
|
bm25(history_fts) as rank,
|
||||||
h.markdown_content LIKE :exact_pattern OR
|
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
|
||||||
h.markdown_content LIKE :prefix_pattern OR
|
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
|
||||||
h.markdown_content LIKE :like_pattern
|
FROM history_fts
|
||||||
) THEN 1.0
|
JOIN history h ON history_fts.rowid = h.id
|
||||||
ELSE 0.5
|
WHERE history_fts MATCH :fts_query
|
||||||
END * (
|
|
||||||
CAST(strftime('%s', h.visit_time) AS INTEGER) /
|
|
||||||
CAST(strftime('%s', 'now') AS INTEGER) * 0.5 + 1
|
|
||||||
) as final_rank
|
|
||||||
FROM history h
|
|
||||||
LEFT JOIN history_fts f ON h.id = f.rowid
|
|
||||||
WHERE
|
|
||||||
h.title LIKE :like_pattern
|
|
||||||
OR (h.markdown_content IS NOT NULL AND history_fts MATCH :search)
|
|
||||||
AND (:domain IS NULL OR h.domain = :domain)
|
|
||||||
AND (:start_date IS NULL OR h.visit_time >= :start_date)
|
|
||||||
AND (:end_date IS NULL OR h.visit_time <= :end_date)
|
|
||||||
)
|
|
||||||
SELECT * FROM ranked_results
|
|
||||||
WHERE final_rank > 0
|
|
||||||
ORDER BY final_rank DESC
|
|
||||||
LIMIT 100
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# Prepare search patterns for different matching strategies
|
# Add date filters if provided
|
||||||
params = {
|
if start_date:
|
||||||
'search': f'{search_term}*', # Wildcard suffix matching
|
sql += " AND h.visit_time >= :start_date"
|
||||||
'like_pattern': f'%{search_term}%', # Contains matching
|
params['start_date'] = start_date
|
||||||
'exact_pattern': search_term, # Exact matching
|
if end_date:
|
||||||
'prefix_pattern': f'{search_term}%', # Prefix matching
|
sql += " AND h.visit_time <= :end_date"
|
||||||
'domain': domain,
|
params['end_date'] = end_date
|
||||||
'start_date': start_date,
|
|
||||||
'end_date': end_date
|
sql += " ORDER BY rank, h.visit_time DESC LIMIT 100"
|
||||||
}
|
|
||||||
|
params['fts_query'] = fts_query
|
||||||
|
|
||||||
|
results = db.execute(text(sql), params).fetchall()
|
||||||
|
return [serialize_history_entry(row, include_content) for row in results]
|
||||||
|
|
||||||
# Execute with connection context manager
|
|
||||||
with engine.connect() as connection:
|
|
||||||
results = connection.execute(text(fts_query), params).all()
|
|
||||||
return [serialize_history_entry(row, include_content) for row in results]
|
|
||||||
else:
|
else:
|
||||||
# Optimize non-FTS query
|
# Handle non-search queries
|
||||||
query = db.query(HistoryEntry)
|
query = db.query(HistoryEntry)
|
||||||
|
|
||||||
if domain:
|
if domain:
|
||||||
@@ -179,8 +188,6 @@ async def search_history(
|
|||||||
if end_date:
|
if end_date:
|
||||||
query = query.filter(HistoryEntry.visit_time <= end_date)
|
query = query.filter(HistoryEntry.visit_time <= end_date)
|
||||||
|
|
||||||
# Add index hints and limit
|
|
||||||
query = query.with_hint(HistoryEntry, 'INDEXED BY ix_history_visit_time', 'sqlite')
|
|
||||||
entries = query.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
|
entries = query.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
|
||||||
return [serialize_history_entry(entry, include_content) for entry in entries]
|
return [serialize_history_entry(entry, include_content) for entry in entries]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user