Mirror of https://github.com/Zetaphor/browser-recall.git (synced 2025-12-06 02:19:37 +00:00)

Commit: Fix duplicate records
app/database.py

@@ -1,4 +1,4 @@
-from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, event
+from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, event, text
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from datetime import datetime
@@ -141,5 +141,51 @@ def get_db():
     db = SessionLocal()
     try:
         yield db
+    finally:
+        db.close()
+
+def get_last_processed_timestamp(source):
+    """
+    Get last processed timestamp for a source (e.g., 'chrome_history', 'chrome_bookmarks')
+    """
+    db = next(get_db())
+    try:
+        result = db.execute(
+            text('SELECT last_timestamp FROM last_processed WHERE source = :source'),
+            {'source': source}
+        ).fetchone()
+        return result[0] if result else 0
+    finally:
+        db.close()
+
+def update_last_processed_timestamp(source, timestamp):
+    """
+    Update last processed timestamp for a source
+    """
+    db = next(get_db())
+    try:
+        db.execute(
+            text('''
+                INSERT OR REPLACE INTO last_processed (source, last_timestamp)
+                VALUES (:source, :timestamp)
+            '''),
+            {'source': source, 'timestamp': timestamp}
+        )
+        db.commit()
+    finally:
+        db.close()
+
+def create_tables():
+    db = next(get_db())
+    try:
+        db.execute(
+            text('''
+                CREATE TABLE IF NOT EXISTS last_processed (
+                    source TEXT PRIMARY KEY,
+                    last_timestamp INTEGER
+                )
+            ''')
+        )
+        db.commit()
     finally:
         db.close()
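The helpers above give each data source its own watermark: last_processed keeps one row per source name, and the INSERT OR REPLACE overwrites that row on every run. Below is a minimal sketch of how a caller is meant to use the pair, assuming the app.database module from this diff; the source name and the elided insert step are illustrative.

```python
# Sketch only: how the watermark helpers added in this commit fit together.
# Assumes the app.database module from this diff; the source name is illustrative.
from datetime import datetime

from app.database import (
    create_tables,
    get_last_processed_timestamp,
    update_last_processed_timestamp,
)

create_tables()  # CREATE TABLE IF NOT EXISTS, so calling it on every startup is safe

source = "browser_history"
last_seen = get_last_processed_timestamp(source)     # returns 0 when the source has no row yet
batch_time = int(datetime.now().timestamp())

# ... insert only the records whose timestamp is greater than last_seen ...

update_last_processed_timestamp(source, batch_time)  # INSERT OR REPLACE keeps one row per source
```

Returning 0 when no row exists means the very first run treats everything as new, which matches the behaviour of process_browser_history() below.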
app/main.py (87 lines changed)

@@ -15,8 +15,16 @@ from .logging_config import setup_logger
 from fastapi.templating import Jinja2Templates
 from fastapi.staticfiles import StaticFiles
 from fastapi import Request
+import browser_history
+
-from .database import get_db, HistoryEntry, Bookmark
+from .database import (
+    get_db,
+    HistoryEntry,
+    Bookmark,
+    get_last_processed_timestamp,
+    update_last_processed_timestamp,
+    create_tables
+)
 from .scheduler import HistoryScheduler
 from .page_info import PageInfo
 from .page_reader import PageReader
@@ -43,10 +51,22 @@ app.mount("/static", StaticFiles(directory="app/static"), name="static")
 @app.on_event("startup")
 async def startup_event():
     logger.info("Starting application")
-    # Initial bookmark fetch
-    await scheduler.update_bookmarks()
-    # Start the background task
-    asyncio.create_task(scheduler.update_history())
+    # Create necessary tables
+    create_tables()
+
+    # Initial history and bookmark fetch
+    try:
+        # Process history
+        process_browser_history()
+
+        # Process bookmarks
+        await scheduler.update_bookmarks()
+
+        # Start the background tasks
+        asyncio.create_task(scheduler.update_history())
+    except Exception as e:
+        logger.error(f"Error during startup: {str(e)}")
+
 def serialize_history_entry(entry, include_content: bool = False):
     """Serialize a HistoryEntry object to a dictionary"""
@@ -378,4 +398,59 @@ async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
     return templates.TemplateResponse(
         "bookmarks.html",
         {"request": request, "bookmarks": bookmarks}
     )
+
+
+def process_browser_history():
+    try:
+        logger.info("Starting browser history processing")
+        outputs = browser_history.get_history()
+        history_list = outputs.histories  # This is a list of tuples (timestamp, url, title)
+        logger.info(f"Found {len(history_list)} total history items")
+
+        current_timestamp = int(datetime.now().timestamp())
+        source_key = "browser_history"  # Single source since we get combined history
+        last_timestamp = get_last_processed_timestamp(source_key)
+
+        logger.info(f"Last processed timestamp: {last_timestamp}")
+
+        # Filter for only new entries
+        new_entries = [
+            entry for entry in history_list
+            if entry[0].timestamp() > last_timestamp
+        ]
+
+        logger.info(f"Found {len(new_entries)} new entries")
+
+        if new_entries:
+            for timestamp, url, title in new_entries:
+                logger.info(f"Processing entry: {timestamp} - {url}")
+                domain = urlparse(url).netloc
+                if config.is_domain_ignored(domain):
+                    logger.debug(f"Skipping ignored domain: {domain}")
+                    continue
+
+                # Create history entry
+                db = next(get_db())
+                try:
+                    history_entry = HistoryEntry(
+                        url=url,
+                        title=title,
+                        visit_time=timestamp,
+                        domain=domain
+                    )
+                    db.add(history_entry)
+                    db.commit()
+                except Exception as e:
+                    logger.error(f"Error storing history item: {str(e)}")
+                    db.rollback()
+                finally:
+                    db.close()
+
+            # Update the last processed timestamp
+            update_last_processed_timestamp(source_key, current_timestamp)
+            logger.info(f"Updated timestamp to {current_timestamp}")
+
+        logger.info(f"Processed {len(new_entries)} new items")
+
+    except Exception as e:
+        logger.error(f"Error processing browser history: {str(e)}", exc_info=True)
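The actual duplicate fix in process_browser_history() is the timestamp filter: only entries whose visit time is newer than the stored watermark get inserted, and the watermark then jumps forward once per batch. A stripped-down sketch of that filter follows, assuming entries are (datetime, url, title) tuples as the diff's own comment states; the sample data and the watermark value are made up.

```python
# Sketch of the dedup filter at the heart of process_browser_history().
# Entries are assumed to be (datetime, url, title) tuples, per the comment in the diff;
# the sample data and the watermark value are made up.
from datetime import datetime

history_list = [
    (datetime(2025, 1, 1, 12, 0), "https://example.com/a", "Already stored"),
    (datetime(2025, 1, 2, 9, 30), "https://example.com/b", "New since last run"),
]
last_timestamp = datetime(2025, 1, 1, 23, 59).timestamp()

new_entries = [entry for entry in history_list if entry[0].timestamp() > last_timestamp]

# Only the January 2nd visit survives; everything at or before the watermark is skipped,
# which is what stops the same history rows from being inserted again on the next run.
print(new_entries)
```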
app/scheduler.py (126 lines changed)

@@ -1,7 +1,7 @@
 from fastapi import BackgroundTasks
 from datetime import datetime, timedelta
 import asyncio
-from .database import SessionLocal, HistoryEntry, Bookmark
+from .database import SessionLocal, HistoryEntry, Bookmark, get_last_processed_timestamp, update_last_processed_timestamp
 from .browser import BrowserHistoryCollector
 from .page_reader import PageReader
 from sqlalchemy import func
@@ -10,6 +10,9 @@ import pytz
 from .config import Config
 from .database import get_db
 from urllib.parse import urlparse
+import logging
+
+logger = logging.getLogger(__name__)
 
 class HistoryScheduler:
     def __init__(self):
@@ -18,6 +21,7 @@ class HistoryScheduler:
         self.last_history_update = None
         self.content_update_interval = timedelta(hours=24)  # Update content daily
         self.config = Config()
+        self.db_lock = asyncio.Lock()
 
     def _normalize_datetime(self, dt: datetime) -> datetime:
         """Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't"""
@@ -32,68 +36,104 @@
         return dt.astimezone(pytz.UTC)
 
     async def update_bookmarks(self):
-        """Update bookmarks from browser"""
+        """Update bookmarks from browsers"""
         try:
-            db = next(get_db())
+            current_timestamp = int(datetime.now().timestamp())
+            source_key = "browser_bookmarks"
+            last_timestamp = get_last_processed_timestamp(source_key)
+
+            logger.info(f"Fetching bookmarks. Last processed timestamp: {last_timestamp}")
             bookmarks = self.browser_collector.fetch_bookmarks()
+            logger.info(f"Found {len(bookmarks)} total bookmarks")
 
-            for added_time, url, title, folder in bookmarks:  # Unpack the tuple
-                # Extract domain and check if it should be ignored
-                domain = urlparse(url).netloc
-                if self.config.is_domain_ignored(domain):
-                    continue
-
-                # Normalize the datetime
-                added_time = self._normalize_datetime(added_time)
-
-                # Process the bookmark only if domain is not ignored
-                bookmark_entry = Bookmark(
-                    url=url,
-                    title=title,
-                    added_time=added_time,
-                    folder=folder,
-                    domain=domain
-                )
-                db.add(bookmark_entry)
-
-            db.commit()
+            # Filter for only new bookmarks
+            new_bookmarks = [
+                (added_time, url, title, folder) for added_time, url, title, folder in bookmarks
+                if self._normalize_datetime(added_time).timestamp() > last_timestamp
+            ]
+
+            logger.info(f"Found {len(new_bookmarks)} new bookmarks to process")
+
+            if new_bookmarks:
+                async with self.db_lock:
+                    with next(get_db()) as db:
+                        added_count = 0
+                        for added_time, url, title, folder in new_bookmarks:
+                            domain = urlparse(url).netloc
+                            if self.config.is_domain_ignored(domain):
+                                logger.debug(f"Skipping ignored domain: {domain}")
+                                continue
+
+                            added_time = self._normalize_datetime(added_time)
+
+                            bookmark = Bookmark(
+                                url=url,
+                                title=title,
+                                added_time=added_time,
+                                folder=folder,
+                                domain=domain
+                            )
+                            db.add(bookmark)
+                            added_count += 1
+
+                        db.commit()
+                        logger.info(f"Successfully added {added_count} new bookmarks")
+
+                update_last_processed_timestamp(source_key, current_timestamp)
+                logger.info(f"Updated last processed timestamp to {current_timestamp}")
+
         except Exception as e:
-            print(f"Error updating bookmarks: {e}")
-        finally:
-            db.close()
+            logger.error(f"Error updating bookmarks: {str(e)}", exc_info=True)
 
     async def update_history(self):
         """Background task to update history periodically"""
         while True:
             try:
-                db = next(get_db())
+                current_timestamp = int(datetime.now().timestamp())
+                source_key = "browser_history"
+                last_timestamp = get_last_processed_timestamp(source_key)
+
+                logger.info(f"Fetching history. Last processed timestamp: {last_timestamp}")
                 history_entries = self.browser_collector.fetch_history()
+                logger.info(f"Found {len(history_entries)} total history entries")
 
-                for visit_time, url, title in history_entries:  # Unpack the tuple
-                    # Extract domain and check if it should be ignored
-                    domain = urlparse(url).netloc
-                    if self.config.is_domain_ignored(domain):
-                        continue
-
-                    # Normalize the datetime
-                    visit_time = self._normalize_datetime(visit_time)
-
-                    # Process the entry only if domain is not ignored
-                    history_entry = HistoryEntry(
-                        url=url,
-                        title=title,
-                        visit_time=visit_time,
-                        domain=domain
-                    )
-                    db.add(history_entry)
-
-                db.commit()
+                # Filter for only new entries
+                new_entries = [
+                    (visit_time, url, title) for visit_time, url, title in history_entries
+                    if self._normalize_datetime(visit_time).timestamp() > last_timestamp
+                ]
+
+                logger.info(f"Found {len(new_entries)} new history entries to process")
+
+                if new_entries:
+                    async with self.db_lock:
+                        with next(get_db()) as db:
+                            added_count = 0
+                            for visit_time, url, title in new_entries:
+                                domain = urlparse(url).netloc
+                                if self.config.is_domain_ignored(domain):
+                                    logger.debug(f"Skipping ignored domain: {domain}")
+                                    continue
+
+                                visit_time = self._normalize_datetime(visit_time)
+
+                                history_entry = HistoryEntry(
+                                    url=url,
+                                    title=title,
+                                    visit_time=visit_time,
+                                    domain=domain
+                                )
+                                db.add(history_entry)
+                                added_count += 1
+
+                            db.commit()
+                            logger.info(f"Successfully added {added_count} new history entries")
+
+                    update_last_processed_timestamp(source_key, current_timestamp)
+                    logger.info(f"Updated last processed timestamp to {current_timestamp}")
+
             except Exception as e:
-                print(f"Error updating history: {e}")
-            finally:
-                db.close()
+                logger.error(f"Error updating history: {str(e)}", exc_info=True)
 
             await asyncio.sleep(300)  # Wait 5 minutes before next update
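In the scheduler the same watermark filter is combined with async with self.db_lock, so the periodic history task and the bookmark update never hold a database session at the same time. Below is a minimal, self-contained sketch of that locking pattern; names and data are illustrative, and the real code writes SQLAlchemy objects through next(get_db()) as shown above.

```python
# Minimal sketch of the locking pattern added to HistoryScheduler (illustrative names/data).
# A single asyncio.Lock ensures the history task and the bookmark task never write
# through the shared database at the same time.
import asyncio


class SchedulerSketch:
    def __init__(self):
        self.db_lock = asyncio.Lock()
        self.rows = []  # stands in for the SQLAlchemy session / table

    async def write_batch(self, label, items):
        async with self.db_lock:        # only one writer holds the "session" at a time
            for item in items:
                self.rows.append((label, item))
                await asyncio.sleep(0)  # yield point; without the lock, batches could interleave here
            # db.commit() would happen here in the real scheduler


async def main():
    s = SchedulerSketch()
    await asyncio.gather(
        s.write_batch("history", [1, 2, 3]),
        s.write_batch("bookmarks", ["a", "b"]),
    )
    print(s.rows)  # each batch appears contiguously thanks to the lock


asyncio.run(main())
```

Run as-is, each batch lands contiguously in rows; without the lock the two writers could interleave at the await points.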
run-browser-recall.fish (new executable file, 8 lines)

@@ -0,0 +1,8 @@
+#!/usr/bin/env fish
+
+# Activate the virtual environment and run main.py silently
+vf activate general
+python main.py > /dev/null 2>&1 &
+
+# Print a simple confirmation message
+echo "Browser Recall started in background"
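Usage note (assumptions: vf is the virtualfish plugin and a virtual environment named "general" already exists): after marking the script executable with chmod +x run-browser-recall.fish, running it launches the application in the background with its output discarded, so any diagnostics come from the application's own logging setup rather than the terminal.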