Fix duplicate records

2025-01-26 01:01:21 -06:00
parent 4714d3d183
commit 687bbb198e
4 changed files with 219 additions and 50 deletions

View File

@@ -1,4 +1,4 @@
-from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, event
+from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, event, text
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from datetime import datetime
@@ -143,3 +143,49 @@ def get_db():
         yield db
     finally:
         db.close()
+
+def get_last_processed_timestamp(source):
+    """
+    Get last processed timestamp for a source (e.g., 'chrome_history', 'chrome_bookmarks')
+    """
+    db = next(get_db())
+    try:
+        result = db.execute(
+            text('SELECT last_timestamp FROM last_processed WHERE source = :source'),
+            {'source': source}
+        ).fetchone()
+        return result[0] if result else 0
+    finally:
+        db.close()
+
+def update_last_processed_timestamp(source, timestamp):
+    """
+    Update last processed timestamp for a source
+    """
+    db = next(get_db())
+    try:
+        db.execute(
+            text('''
+                INSERT OR REPLACE INTO last_processed (source, last_timestamp)
+                VALUES (:source, :timestamp)
+            '''),
+            {'source': source, 'timestamp': timestamp}
+        )
+        db.commit()
+    finally:
+        db.close()
+
+def create_tables():
+    db = next(get_db())
+    try:
+        db.execute(
+            text('''
+                CREATE TABLE IF NOT EXISTS last_processed (
+                    source TEXT PRIMARY KEY,
+                    last_timestamp INTEGER
+                )
+            ''')
+        )
+        db.commit()
+    finally:
+        db.close()
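For reference, a minimal usage sketch of the watermark helpers added above, assuming the file is importable as `app.database` (the module path and the fake item list are illustrative, not part of the commit):

```python
from datetime import datetime

# Assumed import path; adjust to wherever this database module actually lives.
from app.database import (
    create_tables,
    get_last_processed_timestamp,
    update_last_processed_timestamp,
)

create_tables()  # ensures the last_processed table exists

source = "browser_history"
last = get_last_processed_timestamp(source)  # 0 when the source has never been processed

# Illustrative (timestamp, url) pairs standing in for collector output.
items = [(last + 10, "https://example.com/new"), (max(last - 10, 0), "https://example.com/old")]
new_items = [item for item in items if item[0] > last]  # only entries newer than the watermark
print(f"{len(new_items)} new item(s) to store")

# After storing the new items, advance the watermark so the next run skips them.
update_last_processed_timestamp(source, int(datetime.now().timestamp()))
```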

View File

@@ -15,8 +15,16 @@ from .logging_config import setup_logger
 from fastapi.templating import Jinja2Templates
 from fastapi.staticfiles import StaticFiles
 from fastapi import Request
+import browser_history
-from .database import get_db, HistoryEntry, Bookmark
+from .database import (
+    get_db,
+    HistoryEntry,
+    Bookmark,
+    get_last_processed_timestamp,
+    update_last_processed_timestamp,
+    create_tables
+)
 from .scheduler import HistoryScheduler
 from .page_info import PageInfo
 from .page_reader import PageReader
@@ -43,10 +51,22 @@ app.mount("/static", StaticFiles(directory="app/static"), name="static")
 @app.on_event("startup")
 async def startup_event():
     logger.info("Starting application")
-    # Initial bookmark fetch
-    await scheduler.update_bookmarks()
-    # Start the background task
-    asyncio.create_task(scheduler.update_history())
+    # Create necessary tables
+    create_tables()
+
+    # Initial history and bookmark fetch
+    try:
+        # Process history
+        process_browser_history()
+
+        # Process bookmarks
+        await scheduler.update_bookmarks()
+
+        # Start the background tasks
+        asyncio.create_task(scheduler.update_history())
+    except Exception as e:
+        logger.error(f"Error during startup: {str(e)}")
 def serialize_history_entry(entry, include_content: bool = False):
     """Serialize a HistoryEntry object to a dictionary"""
@@ -379,3 +399,58 @@ async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
"bookmarks.html", "bookmarks.html",
{"request": request, "bookmarks": bookmarks} {"request": request, "bookmarks": bookmarks}
) )
+
+def process_browser_history():
+    try:
+        logger.info("Starting browser history processing")
+        outputs = browser_history.get_history()
+        history_list = outputs.histories  # This is a list of tuples (timestamp, url, title)
+        logger.info(f"Found {len(history_list)} total history items")
+
+        current_timestamp = int(datetime.now().timestamp())
+        source_key = "browser_history"  # Single source since we get combined history
+        last_timestamp = get_last_processed_timestamp(source_key)
+        logger.info(f"Last processed timestamp: {last_timestamp}")
+
+        # Filter for only new entries
+        new_entries = [
+            entry for entry in history_list
+            if entry[0].timestamp() > last_timestamp
+        ]
+        logger.info(f"Found {len(new_entries)} new entries")
+
+        if new_entries:
+            for timestamp, url, title in new_entries:
+                logger.info(f"Processing entry: {timestamp} - {url}")
+                domain = urlparse(url).netloc
+                if config.is_domain_ignored(domain):
+                    logger.debug(f"Skipping ignored domain: {domain}")
+                    continue
+
+                # Create history entry
+                db = next(get_db())
+                try:
+                    history_entry = HistoryEntry(
+                        url=url,
+                        title=title,
+                        visit_time=timestamp,
+                        domain=domain
+                    )
+                    db.add(history_entry)
+                    db.commit()
+                except Exception as e:
+                    logger.error(f"Error storing history item: {str(e)}")
+                    db.rollback()
+                finally:
+                    db.close()
+
+            # Update the last processed timestamp
+            update_last_processed_timestamp(source_key, current_timestamp)
+            logger.info(f"Updated timestamp to {current_timestamp}")
+            logger.info(f"Processed {len(new_entries)} new items")
+    except Exception as e:
+        logger.error(f"Error processing browser history: {str(e)}", exc_info=True)

View File

@@ -1,7 +1,7 @@
 from fastapi import BackgroundTasks
 from datetime import datetime, timedelta
 import asyncio
-from .database import SessionLocal, HistoryEntry, Bookmark
+from .database import SessionLocal, HistoryEntry, Bookmark, get_last_processed_timestamp, update_last_processed_timestamp
 from .browser import BrowserHistoryCollector
 from .page_reader import PageReader
 from sqlalchemy import func
@@ -10,6 +10,9 @@ import pytz
 from .config import Config
 from .database import get_db
 from urllib.parse import urlparse
+import logging
+
+logger = logging.getLogger(__name__)
 
 class HistoryScheduler:
     def __init__(self):
@@ -18,6 +21,7 @@ class HistoryScheduler:
         self.last_history_update = None
         self.content_update_interval = timedelta(hours=24)  # Update content daily
         self.config = Config()
+        self.db_lock = asyncio.Lock()
 
     def _normalize_datetime(self, dt: datetime) -> datetime:
         """Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't"""
@@ -32,68 +36,104 @@ class HistoryScheduler:
         return dt.astimezone(pytz.UTC)
 
     async def update_bookmarks(self):
-        """Update bookmarks from browser"""
+        """Update bookmarks from browsers"""
         try:
-            db = next(get_db())
+            current_timestamp = int(datetime.now().timestamp())
+            source_key = "browser_bookmarks"
+            last_timestamp = get_last_processed_timestamp(source_key)
+            logger.info(f"Fetching bookmarks. Last processed timestamp: {last_timestamp}")
+
             bookmarks = self.browser_collector.fetch_bookmarks()
+            logger.info(f"Found {len(bookmarks)} total bookmarks")
 
-            for added_time, url, title, folder in bookmarks:  # Unpack the tuple
-                # Extract domain and check if it should be ignored
-                domain = urlparse(url).netloc
-                if self.config.is_domain_ignored(domain):
-                    continue
-
-                # Normalize the datetime
-                added_time = self._normalize_datetime(added_time)
-
-                # Process the bookmark only if domain is not ignored
-                bookmark_entry = Bookmark(
-                    url=url,
-                    title=title,
-                    added_time=added_time,
-                    folder=folder,
-                    domain=domain
-                )
-                db.add(bookmark_entry)
-            db.commit()
+            # Filter for only new bookmarks
+            new_bookmarks = [
+                (added_time, url, title, folder) for added_time, url, title, folder in bookmarks
+                if self._normalize_datetime(added_time).timestamp() > last_timestamp
+            ]
+            logger.info(f"Found {len(new_bookmarks)} new bookmarks to process")
+
+            if new_bookmarks:
+                async with self.db_lock:
+                    with next(get_db()) as db:
+                        added_count = 0
+                        for added_time, url, title, folder in new_bookmarks:
+                            domain = urlparse(url).netloc
+                            if self.config.is_domain_ignored(domain):
+                                logger.debug(f"Skipping ignored domain: {domain}")
+                                continue
+
+                            added_time = self._normalize_datetime(added_time)
+                            bookmark = Bookmark(
+                                url=url,
+                                title=title,
+                                added_time=added_time,
+                                folder=folder,
+                                domain=domain
+                            )
+                            db.add(bookmark)
+                            added_count += 1
+                        db.commit()
+                        logger.info(f"Successfully added {added_count} new bookmarks")
+
+            update_last_processed_timestamp(source_key, current_timestamp)
+            logger.info(f"Updated last processed timestamp to {current_timestamp}")
         except Exception as e:
-            print(f"Error updating bookmarks: {e}")
-        finally:
-            db.close()
+            logger.error(f"Error updating bookmarks: {str(e)}", exc_info=True)
 
     async def update_history(self):
         """Background task to update history periodically"""
         while True:
             try:
-                db = next(get_db())
+                current_timestamp = int(datetime.now().timestamp())
+                source_key = "browser_history"
+                last_timestamp = get_last_processed_timestamp(source_key)
+                logger.info(f"Fetching history. Last processed timestamp: {last_timestamp}")
+
                 history_entries = self.browser_collector.fetch_history()
+                logger.info(f"Found {len(history_entries)} total history entries")
 
-                for visit_time, url, title in history_entries:  # Unpack the tuple
-                    # Extract domain and check if it should be ignored
-                    domain = urlparse(url).netloc
-                    if self.config.is_domain_ignored(domain):
-                        continue
-
-                    # Normalize the datetime
-                    visit_time = self._normalize_datetime(visit_time)
-
-                    # Process the entry only if domain is not ignored
-                    history_entry = HistoryEntry(
-                        url=url,
-                        title=title,
-                        visit_time=visit_time,
-                        domain=domain
-                    )
-                    db.add(history_entry)
-
-                db.commit()
+                # Filter for only new entries
+                new_entries = [
+                    (visit_time, url, title) for visit_time, url, title in history_entries
+                    if self._normalize_datetime(visit_time).timestamp() > last_timestamp
+                ]
+                logger.info(f"Found {len(new_entries)} new history entries to process")
+
+                if new_entries:
+                    async with self.db_lock:
+                        with next(get_db()) as db:
+                            added_count = 0
+                            for visit_time, url, title in new_entries:
+                                domain = urlparse(url).netloc
+                                if self.config.is_domain_ignored(domain):
+                                    logger.debug(f"Skipping ignored domain: {domain}")
+                                    continue
+
+                                visit_time = self._normalize_datetime(visit_time)
+                                history_entry = HistoryEntry(
+                                    url=url,
+                                    title=title,
+                                    visit_time=visit_time,
+                                    domain=domain
+                                )
+                                db.add(history_entry)
+                                added_count += 1
+                            db.commit()
+                            logger.info(f"Successfully added {added_count} new history entries")
+
+                update_last_processed_timestamp(source_key, current_timestamp)
+                logger.info(f"Updated last processed timestamp to {current_timestamp}")
             except Exception as e:
-                print(f"Error updating history: {e}")
-            finally:
-                db.close()
+                logger.error(f"Error updating history: {str(e)}", exc_info=True)
 
             await asyncio.sleep(300)  # Wait 5 minutes before next update
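The new `db_lock` exists so the bookmark and history writers cannot interleave their database writes when both coroutines run concurrently. A toy sketch of the pattern (standalone illustration, not project code):

```python
import asyncio

async def writer(lock, store, name, items):
    # Only one writer may hold the lock at a time, so each batch lands contiguously.
    async with lock:
        for item in items:
            store.append((name, item))
            await asyncio.sleep(0)  # yielding inside the critical section is now safe

async def main():
    lock = asyncio.Lock()
    store = []
    await asyncio.gather(
        writer(lock, store, "bookmarks", [1, 2]),
        writer(lock, store, "history", [3, 4]),
    )
    print(store)  # each source's items appear as an uninterrupted block

asyncio.run(main())
```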

run-browser-recall.fish Executable file
View File

@@ -0,0 +1,8 @@
+#!/usr/bin/env fish
+
+# Activate the virtual environment and run main.py silently
+vf activate general
+python main.py > /dev/null 2>&1 &
+
+# Print a simple confirmation message
+echo "Browser Recall started in background"