mirror of https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 10:29:38 +00:00
142 lines · 5.9 KiB · Python
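"""Background scheduler that syncs browser history and bookmarks into the
local database, normalizing timestamps to UTC and skipping ignored domains."""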
from datetime import datetime, timedelta
import asyncio
import logging
from urllib.parse import urlparse

import pytz

from .browser import BrowserHistoryCollector
from .config import Config
from .database import (
    Bookmark,
    HistoryEntry,
    get_db,
    get_last_processed_timestamp,
    update_last_processed_timestamp,
)
from .page_reader import PageReader

logger = logging.getLogger(__name__)


class HistoryScheduler:
    def __init__(self):
        self.browser_collector = BrowserHistoryCollector()
        self.page_reader = PageReader()
        self.last_history_update = None
        self.content_update_interval = timedelta(hours=24)  # Update content daily
        self.config = Config()
        self.db_lock = asyncio.Lock()  # Serializes database writes across async tasks

    def _normalize_datetime(self, dt: datetime) -> datetime:
        """Return dt converted to UTC; naive datetimes are assumed to already be in UTC."""
        if dt is None:
            return None

        # Naive datetime (no tzinfo): assume it's already UTC and just attach the zone
        if dt.tzinfo is None:
            return pytz.UTC.localize(dt)

        # Timezone-aware datetime: convert it to UTC
        return dt.astimezone(pytz.UTC)
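
    # Illustrative examples of the normalization contract above (added for
    # clarity; not in the original source):
    #   _normalize_datetime(datetime(2024, 1, 1, 12, 0))
    #       -> 2024-01-01 12:00:00+00:00 (naive input assumed to be UTC)
    #   _normalize_datetime(pytz.timezone("US/Eastern").localize(datetime(2024, 1, 1, 12, 0)))
    #       -> 2024-01-01 17:00:00+00:00 (aware input converted to UTC)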

    async def update_bookmarks(self):
        """Update bookmarks from browsers"""
        try:
            current_timestamp = int(datetime.now().timestamp())
            source_key = "browser_bookmarks"
            last_timestamp = get_last_processed_timestamp(source_key)

            logger.info(f"Fetching bookmarks. Last processed timestamp: {last_timestamp}")
            bookmarks = self.browser_collector.fetch_bookmarks()
            logger.info(f"Found {len(bookmarks)} total bookmarks")

            # Filter for only new bookmarks; skip entries without an added
            # time, since _normalize_datetime passes None through
            new_bookmarks = [
                (added_time, url, title, folder)
                for added_time, url, title, folder in bookmarks
                if added_time is not None
                and self._normalize_datetime(added_time).timestamp() > last_timestamp
            ]

            logger.info(f"Found {len(new_bookmarks)} new bookmarks to process")

            if new_bookmarks:
                async with self.db_lock:
                    # get_db() is expected to yield a SQLAlchemy Session,
                    # which also works as a context manager
                    with next(get_db()) as db:
                        added_count = 0
                        for added_time, url, title, folder in new_bookmarks:
                            domain = urlparse(url).netloc
                            if self.config.is_domain_ignored(domain):
                                logger.debug(f"Skipping ignored domain: {domain}")
                                continue

                            added_time = self._normalize_datetime(added_time)

                            bookmark = Bookmark(
                                url=url,
                                title=title,
                                added_time=added_time,
                                folder=folder,
                                domain=domain
                            )
                            db.add(bookmark)
                            added_count += 1

                        db.commit()
                        logger.info(f"Successfully added {added_count} new bookmarks")

            update_last_processed_timestamp(source_key, current_timestamp)
            logger.info(f"Updated last processed timestamp to {current_timestamp}")

        except Exception as e:
            logger.error(f"Error updating bookmarks: {str(e)}", exc_info=True)

    async def update_history(self):
        """Background task to update history periodically"""
        while True:
            try:
                current_timestamp = int(datetime.now().timestamp())
                source_key = "browser_history"
                last_timestamp = get_last_processed_timestamp(source_key)

                logger.info(f"Fetching history. Last processed timestamp: {last_timestamp}")
                history_entries = self.browser_collector.fetch_history()
                logger.info(f"Found {len(history_entries)} total history entries")

                # Filter for only new entries, skipping any without a visit time
                new_entries = [
                    (visit_time, url, title)
                    for visit_time, url, title in history_entries
                    if visit_time is not None
                    and self._normalize_datetime(visit_time).timestamp() > last_timestamp
                ]

                logger.info(f"Found {len(new_entries)} new history entries to process")

                if new_entries:
                    async with self.db_lock:
                        with next(get_db()) as db:
                            added_count = 0
                            for visit_time, url, title in new_entries:
                                domain = urlparse(url).netloc
                                if self.config.is_domain_ignored(domain):
                                    logger.debug(f"Skipping ignored domain: {domain}")
                                    continue

                                visit_time = self._normalize_datetime(visit_time)

                                history_entry = HistoryEntry(
                                    url=url,
                                    title=title,
                                    visit_time=visit_time,
                                    domain=domain
                                )
                                db.add(history_entry)
                                added_count += 1

                            db.commit()
                            logger.info(f"Successfully added {added_count} new history entries")

                update_last_processed_timestamp(source_key, current_timestamp)
                logger.info(f"Updated last processed timestamp to {current_timestamp}")

            except Exception as e:
                logger.error(f"Error updating history: {str(e)}", exc_info=True)

            await asyncio.sleep(300)  # Wait 5 minutes before the next update

    async def close(self):
        """Cleanup resources"""
        await self.page_reader.close()
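
# Usage sketch (illustrative; not part of this module): one plausible way to
# run the scheduler inside a FastAPI application. The lifespan wiring below is
# an assumption about how an app might use this class, not code from this repo.
#
#     from contextlib import asynccontextmanager
#     from fastapi import FastAPI
#
#     scheduler = HistoryScheduler()
#
#     @asynccontextmanager
#     async def lifespan(app: FastAPI):
#         # update_history() loops forever, sleeping 300s between passes,
#         # so it must run as a background task rather than be awaited.
#         task = asyncio.create_task(scheduler.update_history())
#         await scheduler.update_bookmarks()  # one-off pass at startup
#         try:
#             yield
#         finally:
#             task.cancel()
#             await scheduler.close()
#
#     app = FastAPI(lifespan=lifespan)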