diff --git a/.python-version b/.python-version
index ac957df..251b350 100644
--- a/.python-version
+++ b/.python-version
@@ -1 +1 @@
-3.10.6
+3.10.16
diff --git a/app/config.py b/app/config.py
index 17fe9d7..a51836b 100644
--- a/app/config.py
+++ b/app/config.py
@@ -2,6 +2,10 @@ import yaml
from pathlib import Path
from typing import Set
import fnmatch
+import os
+import logging
+
+logger = logging.getLogger(__name__)
class Config:
def __init__(self):
@@ -127,4 +131,121 @@ class ReaderConfig:
if domain_part != pattern_part:
return False
- return True
\ No newline at end of file
+ return True
+
+DEFAULT_CONFIG_PATH = 'config/reader_config.yaml'
+USER_CONFIG_DIR = os.path.expanduser("~/.config/browser-recall")
+USER_CONFIG_PATH = os.path.join(USER_CONFIG_DIR, 'reader_config.yaml')
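+
+# Illustrative config shape (only the excluded_domains key is read by this class;
+# the domain values below are made-up examples):
+#
+#   excluded_domains:
+#     - "localhost"
+#     - "*.doubleclick.net"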
+
+class Config:
+ _instance = None
+
+ def __new__(cls, *args, **kwargs):
+ if not cls._instance:
+ cls._instance = super(Config, cls).__new__(cls)
+ cls._instance._initialized = False
+ return cls._instance
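+
+    # Usage sketch: every caller shares the one instance, so reload_config() is
+    # visible process-wide. The path argument only matters on first construction:
+    #   cfg_a = Config()
+    #   cfg_b = Config("some/other/path.yaml")  # hypothetical path; same instance, arg ignored
+    #   assert cfg_a is cfg_b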
+
+ def __init__(self, config_path=None):
+ if self._initialized:
+ return
+ self._initialized = True
+
+ self.config_path = self._determine_config_path(config_path)
+ self.config_data = self._load_config()
+ logger.info(f"Config initialized using: {self.config_path}")
+        # Direct iteration with fnmatch is fine for moderate lists; pre-process
+        # the excluded domains for faster lookup only if the list grows large.
+ self.excluded_domains = self.config_data.get('excluded_domains', [])
+ # Ensure it's a list
+ if not isinstance(self.excluded_domains, list):
+ logger.warning(f"Excluded domains in config is not a list: {self.excluded_domains}. Ignoring.")
+ self.excluded_domains = []
+
+
+ def _determine_config_path(self, provided_path):
+ """Determine the correct config path to use."""
+ if provided_path and os.path.exists(provided_path):
+ return provided_path
+ if os.path.exists(USER_CONFIG_PATH):
+ return USER_CONFIG_PATH
+ if os.path.exists(DEFAULT_CONFIG_PATH):
+ return DEFAULT_CONFIG_PATH
+ logger.warning("No configuration file found at default or user locations. Using empty config.")
+ return None # Indicate no file was found
+
+ def _load_config(self):
+ """Loads the YAML configuration file."""
+ if not self.config_path:
+ return {} # Return empty dict if no config file path determined
+
+ try:
+ with open(self.config_path, 'r') as f:
+ return yaml.safe_load(f) or {} # Return empty dict if file is empty
+ except FileNotFoundError:
+ logger.warning(f"Configuration file not found at {self.config_path}. Using default settings.")
+ return {}
+ except yaml.YAMLError as e:
+ logger.error(f"Error parsing configuration file {self.config_path}: {e}")
+ return {} # Return empty dict on parsing error
+ except Exception as e:
+ logger.error(f"Unexpected error loading configuration {self.config_path}: {e}")
+ return {}
+
+ def get_config(self):
+ """Returns the loaded configuration data."""
+ return self.config_data
+
+ def reload_config(self):
+ """Reloads the configuration from the file."""
+ logger.info(f"Reloading configuration from: {self.config_path}")
+ self.config_data = self._load_config()
+ self.excluded_domains = self.config_data.get('excluded_domains', [])
+ if not isinstance(self.excluded_domains, list):
+ logger.warning(f"Excluded domains in reloaded config is not a list: {self.excluded_domains}. Ignoring.")
+ self.excluded_domains = []
+ logger.info("Configuration reloaded.")
+
+
+ def is_domain_ignored(self, domain: str) -> bool:
+ """
+ Checks if a given domain matches any pattern in the excluded_domains list.
+ Supports exact matches and wildcard (*) matching using fnmatch.
+ """
+ if not domain: # Ignore empty domains
+ return True
+ if not self.excluded_domains: # If list is empty, nothing is ignored
+ return False
+
+ # Normalize domain to lowercase for case-insensitive comparison
+ domain_lower = domain.lower()
+
+ for pattern in self.excluded_domains:
+ if not isinstance(pattern, str): # Skip non-string patterns
+ continue
+
+ # Normalize pattern to lowercase
+ pattern_lower = pattern.lower()
+
+ # Use fnmatch.fnmatch for wildcard support (*)
+ if fnmatch.fnmatch(domain_lower, pattern_lower):
+ # logger.debug(f"Domain '{domain}' ignored due to pattern '{pattern}'")
+ return True
+ return False
+
+ # --- Add methods to get specific config values safely ---
+ @property
+ def history_update_interval_seconds(self) -> int:
+ """Gets the history update interval, defaulting to 300."""
+ return self.config_data.get('history_update_interval_seconds', 300)
+
+ @property
+ def markdown_update_interval_seconds(self) -> int:
+ """Gets the markdown update interval, defaulting to 300."""
+ return self.config_data.get('markdown_update_interval_seconds', 300)
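+
+    # Usage sketch (falls back to 300 when the key is missing from the YAML):
+    #   interval = Config().markdown_update_interval_seconds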
+
+ # Add other specific getters as needed
+ # Example:
+ # @property
+ # def some_other_setting(self) -> str:
+ # return self.config_data.get('some_other_setting', 'default_value')
\ No newline at end of file
diff --git a/app/main.py b/app/main.py
index debcf91..c0cafb1 100644
--- a/app/main.py
+++ b/app/main.py
@@ -1,493 +1,293 @@
-from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException
+from fastapi import FastAPI, Depends
from sqlalchemy.orm import Session
-from datetime import datetime, timezone, timedelta
-from typing import List, Optional
+from datetime import datetime, timezone
+from typing import Optional
import asyncio
-from fastapi import WebSocketDisconnect
from urllib.parse import urlparse
-import pytz
from fastapi.middleware.cors import CORSMiddleware
-import iso8601
-from bs4 import BeautifulSoup
-from sqlalchemy import text
-from sqlalchemy.sql import text
-from .logging_config import setup_logger
-from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
-from fastapi import Request
import browser_history
+from crawl4ai import AsyncWebCrawler
+from sqlalchemy import text  # used by the /health check below
+
+# Local imports
+from .logging_config import setup_logger
from .database import (
get_db,
HistoryEntry,
- Bookmark,
get_last_processed_timestamp,
update_last_processed_timestamp,
create_tables,
engine,
- recreate_fts_tables
+    # recreate_fts_tables  # Re-enable if needed; usually run manually or via a migration tool
)
-from .scheduler import HistoryScheduler
-from .page_info import PageInfo
-from .page_reader import PageReader
from .config import Config
-from sqlalchemy.ext.declarative import declarative_base
+
+# Import Routers
+from .routers import history, bookmarks, config as api_config, websocket, ui
logger = setup_logger(__name__)
-app = FastAPI()
-scheduler = HistoryScheduler()
-config = Config()
+# --- Global Variables ---
+# These are accessed by other modules (like websocket router)
+# Consider using app state or dependency injection for cleaner management if complexity grows
+config_manager = Config() # Renamed to avoid conflict with router import
+crawler: Optional[AsyncWebCrawler] = None
-# Add CORS middleware to allow WebSocket connections
+# Import scheduler *after* crawler is defined
+from .scheduler import HistoryScheduler
+scheduler: Optional[HistoryScheduler] = None  # Instantiated during startup
+
+# --- FastAPI App Initialization ---
+app = FastAPI(title="Browser History Search API")
+
+# Add CORS middleware
app.add_middleware(
CORSMiddleware,
- allow_origins=["*"], # In production, specify your domains
+ allow_origins=["*"], # Adjust in production
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
-templates = Jinja2Templates(directory="app/templates")
+# Mount static files and templates
app.mount("/static", StaticFiles(directory="app/static"), name="static")
+# Note: Jinja2 templates are instantiated inside the ui router; no global instance is needed here.
-@app.on_event("startup")
-async def startup_event():
- logger.info("Starting application")
-
- try:
- # First create the base tables
- logger.info("Creating base tables...")
- create_tables()
-
- # # Drop and recreate FTS tables
- # logger.info("Recreating FTS tables...")
- # with engine.connect() as conn:
- # # First check if the main history table exists
- # result = conn.execute(text(
- # "SELECT name FROM sqlite_master WHERE type='table' AND name='history'"
- # )).fetchone()
-
- # if not result:
- # logger.info("Main history table doesn't exist yet, creating tables...")
- # Base.metadata.create_all(bind=engine)
-
- # # Now recreate FTS tables
- # logger.info("Dropping and recreating FTS tables...")
- # recreate_fts_tables()
-
- # logger.info("FTS tables recreation completed")
-
- # Initial history and bookmark fetch
- logger.info("Processing initial browser history...")
- process_browser_history()
-
- logger.info("Updating bookmarks...")
- await scheduler.update_bookmarks()
-
- # Start the background tasks
- logger.info("Starting background tasks...")
- asyncio.create_task(scheduler.update_history())
-
- logger.info("Startup completed successfully")
-
- except Exception as e:
- logger.error(f"Error during startup: {str(e)}", exc_info=True)
- raise
-
-def serialize_history_entry(entry, include_content: bool = False):
- """Serialize a HistoryEntry object to a dictionary"""
- # Handle both ORM objects and raw SQL results
- if hasattr(entry, '_mapping'): # Raw SQL result
- result = {
- "id": entry.id,
- "url": entry.url,
- "title": entry.title,
- "visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
- "domain": entry.domain,
- }
- else: # ORM object
- result = {
- "id": entry.id,
- "url": entry.url,
- "title": entry.title,
- "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
- "domain": entry.domain,
- }
-
- if include_content:
- result["markdown_content"] = entry.markdown_content
- return result
-
-def serialize_bookmark(bookmark):
- """Serialize a Bookmark object to a dictionary"""
- return {
- "id": bookmark.id,
- "url": bookmark.url,
- "title": bookmark.title,
- "added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
- "folder": bookmark.folder,
- "domain": bookmark.domain,
- }
-
-@app.get("/history/search")
-async def search_history(
- query: Optional[str] = Query(None),
- domain: Optional[str] = Query(None),
- start_date: Optional[str] = Query(None),
- end_date: Optional[str] = Query(None),
- include_content: bool = Query(False),
- db: Session = Depends(get_db)
-):
- """Search history using FTS5"""
- try:
- if query:
- # Build the FTS query
- fts_conditions = [f'title:{query}* OR markdown_content:{query}*']
- params = {'query': query}
-
- if domain:
- fts_conditions.append(f'domain:"{domain}"')
-
- fts_query = ' AND '.join(fts_conditions)
-
- # Build the SQL query
- sql = """
- SELECT
- h.*,
- bm25(history_fts) as rank,
- highlight(history_fts, 0, '', '') as title_highlight,
- highlight(history_fts, 1, '', '') as content_highlight
- FROM history_fts
- JOIN history h ON history_fts.rowid = h.id
- WHERE history_fts MATCH :fts_query
- """
-
- # Add date filters if provided
- if start_date:
- sql += " AND h.visit_time >= :start_date"
- params['start_date'] = start_date
- if end_date:
- sql += " AND h.visit_time <= :end_date"
- params['end_date'] = end_date
-
- sql += " ORDER BY rank, h.visit_time DESC LIMIT 100"
-
- params['fts_query'] = fts_query
-
- results = db.execute(text(sql), params).fetchall()
- return [serialize_history_entry(row, include_content) for row in results]
-
- else:
- # Handle non-search queries
- query = db.query(HistoryEntry)
-
- if domain:
- query = query.filter(HistoryEntry.domain == domain)
- if start_date:
- query = query.filter(HistoryEntry.visit_time >= start_date)
- if end_date:
- query = query.filter(HistoryEntry.visit_time <= end_date)
-
- entries = query.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
- return [serialize_history_entry(entry, include_content) for entry in entries]
-
- except Exception as e:
- logger.error(f"Search error: {str(e)}", exc_info=True)
- raise HTTPException(
- status_code=500,
- detail={"message": "Search operation failed", "error": str(e)}
- )
-
-@app.get("/bookmarks/search")
-async def search_bookmarks(
- domain: Optional[str] = Query(None),
- folder: Optional[str] = Query(None),
- search_term: Optional[str] = Query(None),
- db: Session = Depends(get_db)
-):
- """Search bookmarks with optimized queries"""
- try:
- # Build query efficiently
- query = db.query(Bookmark)
-
- # Apply filters using index-optimized queries
- if domain:
- query = query.filter(Bookmark.domain == domain)
-
- if folder:
- query = query.filter(Bookmark.folder == folder)
-
- if search_term:
- # Use LIKE with index hint for title search
- search_pattern = f"%{search_term}%"
- query = query.filter(
- Bookmark.title.ilike(search_pattern)
- ).with_hint(
- Bookmark,
- 'INDEXED BY ix_bookmarks_title',
- 'sqlite'
- )
-
- # Add ordering and limit for better performance
- bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
-
- return [serialize_bookmark(bookmark) for bookmark in bookmarks]
-
- except Exception as e:
- print(f"Bookmark search error: {e}")
- raise HTTPException(status_code=500, detail="Search operation failed")
-
-# Add new endpoint for advanced full-text search
-@app.get("/history/search/advanced")
-async def advanced_history_search(
- query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
- include_content: bool = Query(False),
- db: Session = Depends(get_db)
-):
- """Advanced full-text search using SQLite FTS5 features"""
- try:
- # Use raw SQL for advanced FTS query
- fts_query = """
- SELECT h.*, rank
- FROM history h
- INNER JOIN history_fts f ON h.id = f.rowid
- WHERE history_fts MATCH :query
- ORDER BY rank
- LIMIT 1000
- """
-
- results = db.execute(text(fts_query), {'query': query}).all()
-
- # Convert results to HistoryEntry objects
- entries = [
- serialize_history_entry(
- HistoryEntry(
- id=row.id,
- url=row.url,
- title=row.title,
- visit_time=row.visit_time,
- domain=row.domain,
- markdown_content=row.markdown_content if include_content else None
- ),
- include_content
- )
- for row in results
- ]
-
- return entries
-
- except Exception as e:
- print(f"Advanced search error: {e}")
- raise HTTPException(status_code=500, detail="Advanced search operation failed")
-
-@app.websocket("/ws")
-async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
- logger.info("New WebSocket connection established")
- page_reader = PageReader()
- await websocket.accept()
- try:
- while True:
- data = await websocket.receive_json()
-
- # Parse the URL and check if domain should be ignored
- domain = urlparse(data['url']).netloc
- if config.is_domain_ignored(domain):
- logger.info(f"Ignoring domain: {domain}")
- await websocket.send_json({
- "status": "ignored",
- "message": f"Domain {domain} is in ignore list"
- })
- continue
-
- logger.info(f"Processing page: {data['url']}")
- timestamp = iso8601.parse_date(data['timestamp'])
-
- # Check if we already have a recent entry for this URL
- existing_entry = db.query(HistoryEntry).filter(
- HistoryEntry.url == data['url'],
- HistoryEntry.visit_time >= timestamp - timedelta(minutes=5)
- ).first()
-
- if existing_entry:
- print(f"Recent entry exists for URL: {data['url']}")
- await websocket.send_json({
- "status": "skipped",
- "message": "Recent entry exists"
- })
- continue
-
- page_info = PageInfo(
- url=data['url'],
- html=data['html'],
- timestamp=timestamp
- )
-
- # Debug HTML content
- print(f"HTML content length before processing: {len(page_info.html)}")
-
- # Extract title
- soup = BeautifulSoup(page_info.html, 'html.parser')
- title = soup.title.string if soup.title else ''
- print(f"Extracted title: {title}")
-
- # Debug markdown conversion
- print("Starting markdown conversion...")
- cleaned_html = page_reader.clean_html(page_info.html)
- print(f"Cleaned HTML length: {len(cleaned_html)}")
-
- markdown_content = page_reader.html_to_markdown(page_info.html)
- print(f"Markdown conversion complete. Content length: {len(markdown_content) if markdown_content else 0}")
-
- if markdown_content:
- print("First 100 chars of markdown:", markdown_content[:100])
- else:
- print("No markdown content generated")
-
- if not title and not markdown_content:
- print(f"No content extracted from: {page_info.url}")
- await websocket.send_json({
- "status": "skipped",
- "message": "No content extracted"
- })
- continue
-
- # Create history entry
- history_entry = HistoryEntry(
- url=page_info.url,
- title=title,
- visit_time=page_info.timestamp,
- domain=domain,
- markdown_content=markdown_content,
- last_content_update=datetime.now(timezone.utc)
- )
-
- # Debug database operation
- print(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}")
-
- # Use bulk operations for better performance
- db.add(history_entry)
-
- try:
- db.commit()
- print(f"Successfully saved entry for: {page_info.url}")
- print(f"Verify markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}")
- await websocket.send_json({
- "status": "success",
- "message": f"Processed page: {page_info.url}"
- })
- except Exception as e:
- db.rollback()
- print(f"Error saving entry: {e}")
- await websocket.send_json({
- "status": "error",
- "message": "Database error"
- })
-
- except WebSocketDisconnect:
- logger.info("Client disconnected")
- except Exception as e:
- logger.error("Error in WebSocket handler", exc_info=True)
- finally:
- await page_reader.close()
-
-@app.get("/config/ignored-domains")
-async def get_ignored_domains():
- """Get list of ignored domain patterns"""
- return {"ignored_domains": config.config.get('ignored_domains', [])}
-
-@app.post("/config/ignored-domains")
-async def add_ignored_domain(pattern: str):
- """Add a new domain pattern to ignored list"""
- config.add_ignored_domain(pattern)
- return {"status": "success", "message": f"Added pattern: {pattern}"}
-
-@app.delete("/config/ignored-domains/{pattern}")
-async def remove_ignored_domain(pattern: str):
- """Remove a domain pattern from ignored list"""
- config.remove_ignored_domain(pattern)
- return {"status": "success", "message": f"Removed pattern: {pattern}"}
-
-@app.get("/")
-async def home(request: Request, db: Session = Depends(get_db)):
- # Get recent history entries
- entries = db.query(HistoryEntry)\
- .order_by(HistoryEntry.visit_time.desc())\
- .limit(50)\
- .all()
- return templates.TemplateResponse(
- "index.html",
- {"request": request, "entries": entries}
- )
-
-@app.get("/search")
-async def search_page(request: Request):
- return templates.TemplateResponse(
- "search.html",
- {"request": request}
- )
-
-@app.get("/bookmarks")
-async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
- bookmarks = db.query(Bookmark)\
- .order_by(Bookmark.added_time.desc())\
- .limit(50)\
- .all()
- return templates.TemplateResponse(
- "bookmarks.html",
- {"request": request, "bookmarks": bookmarks}
- )
-
+# --- Helper Function (Initial Sync) ---
def process_browser_history():
+ """Fetches and stores new history entries from browser_history library (Initial Sync)."""
try:
- logger.info("Starting browser history processing")
+ logger.info("Starting browser history processing (initial sync)")
outputs = browser_history.get_history()
- history_list = outputs.histories # This is a list of tuples (timestamp, url, title)
- logger.info(f"Found {len(history_list)} total history items")
+    # browser_history.get_history() returns an Outputs object; its .histories holds the combined list
+ history_list = []
+ if hasattr(outputs, 'histories') and outputs.histories:
+ history_list = outputs.histories # List of (datetime, url, title)
+ else:
+ logger.warning("Could not retrieve histories list from browser_history output.")
+ return # Exit if no history list found
- current_timestamp = int(datetime.now().timestamp())
- source_key = "browser_history" # Single source since we get combined history
- last_timestamp = get_last_processed_timestamp(source_key)
+ logger.info(f"Found {len(history_list)} total history items from browser_history library")
- logger.info(f"Last processed timestamp: {last_timestamp}")
+ current_timestamp_dt = datetime.now(timezone.utc)
+ current_timestamp = int(current_timestamp_dt.timestamp()) # Use timezone-aware timestamp
+ source_key = "browser_history_sync" # Differentiate from scheduler source
+ last_timestamp = get_last_processed_timestamp(source_key) or 0 # Ensure it's 0 if None
- # Filter for only new entries
- new_entries = [
- entry for entry in history_list
- if entry[0].timestamp() > last_timestamp
- ]
+ logger.info(f"Last processed timestamp for initial sync '{source_key}': {last_timestamp}")
- logger.info(f"Found {len(new_entries)} new entries")
+ new_entries = []
+ processed_urls_times = set() # Avoid duplicates within the batch
+
+ for entry in history_list:
+ # Basic validation of entry structure
+ if not isinstance(entry, (tuple, list)) or len(entry) < 2:
+ logger.warning(f"Skipping malformed history entry: {entry}")
+ continue
+ timestamp, url = entry[0], entry[1]
+ title = entry[2] if len(entry) > 2 else "" # Handle optional title
+
+ if not url or not timestamp:
+ logger.warning(f"Skipping entry with missing URL or timestamp: Title='{title}'")
+ continue
+
+ # Ensure timestamp is datetime object
+ if not isinstance(timestamp, datetime):
+ logger.warning(f"Skipping entry with non-datetime timestamp ({type(timestamp)}): {url}")
+ continue
+
+ # Normalize timestamp (Assume local if naive, convert to UTC)
+ if timestamp.tzinfo is None or timestamp.tzinfo.utcoffset(timestamp) is None:
+ try:
+ timestamp_aware = timestamp.astimezone() # Make aware using system local
+ except Exception as tz_err:
+ logger.warning(f"Could not determine local timezone for naive timestamp {timestamp}. Assuming UTC. Error: {tz_err}")
+ timestamp_aware = timestamp.replace(tzinfo=timezone.utc) # Fallback to UTC
+ else:
+ timestamp_aware = timestamp
+ timestamp_utc = timestamp_aware.astimezone(timezone.utc)
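+        # e.g. a naive 09:00 stamp on a UTC+2 machine becomes 09:00+02:00,
+        # then 07:00+00:00 after this conversion (illustrative)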
+
+
+ # Filter for only new entries based on normalized UTC timestamp
+ if timestamp_utc.timestamp() > last_timestamp:
+ entry_key = (url, timestamp_utc.timestamp())
+ if entry_key in processed_urls_times:
+ continue # Skip duplicate within this batch
+
+ new_entries.append((timestamp_utc, url, title))
+ processed_urls_times.add(entry_key)
+
+ logger.info(f"Found {len(new_entries)} new entries for initial sync after filtering")
if new_entries:
- for timestamp, url, title in new_entries:
- logger.info(f"Processing entry: {timestamp} - {url}")
- domain = urlparse(url).netloc
- if config.is_domain_ignored(domain):
- logger.debug(f"Skipping ignored domain: {domain}")
- continue
-
- # Create history entry
- db = next(get_db())
+ added_count = 0
+ skipped_ignored = 0
+ # Use context manager for session
+ with next(get_db()) as db:
try:
- history_entry = HistoryEntry(
- url=url,
- title=title,
- visit_time=timestamp,
- domain=domain
- )
- db.add(history_entry)
- db.commit()
+ for timestamp_utc, url, title in new_entries:
+ domain = urlparse(url).netloc
+ if config_manager.is_domain_ignored(domain):
+ # logger.debug(f"Skipping ignored domain during initial sync: {domain}")
+ skipped_ignored += 1
+ continue
+
+ # Optional: Check if entry already exists more robustly
+ # existing = db.query(HistoryEntry.id).filter(HistoryEntry.url == url, HistoryEntry.visit_time == timestamp_utc).first()
+ # if existing:
+ # continue
+
+ history_entry = HistoryEntry(
+ url=url,
+ title=title or "", # Ensure title is not None
+ visit_time=timestamp_utc,
+ domain=domain
+ # Note: No markdown content here, only basic history
+ )
+ db.add(history_entry)
+ added_count += 1
+
+ if added_count > 0:
+ db.commit()
+ logger.info(f"Committed {added_count} new history entries from initial sync.")
+ # Update the last processed timestamp only if successful commit
+ update_last_processed_timestamp(source_key, current_timestamp)
+ logger.info(f"Updated initial sync timestamp for '{source_key}' to {current_timestamp}")
+ else:
+ logger.info("No new unique entries to commit during initial sync.")
+ # Update timestamp even if nothing new added, to mark sync time
+ update_last_processed_timestamp(source_key, current_timestamp)
+ logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
+
+
+ if skipped_ignored > 0:
+ logger.info(f"Skipped {skipped_ignored} entries due to ignored domains during initial sync.")
+
except Exception as e:
- logger.error(f"Error storing history item: {str(e)}")
+ logger.error(f"Error storing history item during initial sync: {str(e)}", exc_info=True)
db.rollback()
- finally:
- db.close()
+ else:
+ logger.info("No new history entries found during initial sync.")
+ # Update timestamp even if nothing new found, to mark sync time
+ update_last_processed_timestamp(source_key, current_timestamp)
+ logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
- # Update the last processed timestamp
- update_last_processed_timestamp(source_key, current_timestamp)
- logger.info(f"Updated timestamp to {current_timestamp}")
- logger.info(f"Processed {len(new_entries)} new items")
+ except ImportError:
+ logger.warning("`browser_history` library not found or import failed. Skipping initial sync.")
+ except Exception as e:
+ logger.error(f"Error processing browser history during initial sync: {str(e)}", exc_info=True)
+
+
+# --- Startup and Shutdown Events ---
+@app.on_event("startup")
+async def startup_event():
+ global crawler, scheduler # Allow modification of globals
+ logger.info("Starting application initialization...")
+
+ try:
+ # 1. Ensure base tables exist
+ logger.info("Ensuring base tables exist...")
+ create_tables()
+
+ # 2. Initialize the crawler
+ logger.info("Initializing AsyncWebCrawler...")
+ if crawler is None:
+ crawler = AsyncWebCrawler()
+ logger.info("AsyncWebCrawler initialized.")
+
+ # 3. Initialize the Scheduler *after* the crawler
+ logger.info("Initializing HistoryScheduler...")
+ if scheduler is None:
+ scheduler = HistoryScheduler(crawler=crawler) # Pass crawler instance
+ logger.info("HistoryScheduler initialized.")
+
+ # 4. Perform initial history sync from browser_history library
+ logger.info("Performing initial browser history sync...")
+ process_browser_history() # Sync history not processed before
+
+ # 5. Perform initial bookmark sync (using scheduler's method)
+ # Run in background to avoid blocking startup if it takes long
+ logger.info("Starting initial bookmark sync task...")
+ asyncio.create_task(scheduler.update_bookmarks())
+
+ # 6. Start background tasks (scheduler for ongoing updates)
+ logger.info("Starting background history update task...")
+ asyncio.create_task(scheduler.update_history())
+
+ # --- Markdown Update Tasks ---
+ # 7a. Trigger ONE initial batch processing run in the background
+ logger.info("Starting initial markdown processing batch task...")
+ asyncio.create_task(scheduler._process_markdown_batch()) # Run one batch now
+
+ # 7b. Start the PERIODIC background markdown update task
+ logger.info("Starting periodic background markdown update task...")
+ # Use the renamed method for the loop
+ asyncio.create_task(scheduler.update_missing_markdown_periodically())
+ # --- End Markdown Update Tasks ---
+
+
+ logger.info("Application startup sequence initiated. Background tasks running.")
except Exception as e:
- logger.error(f"Error processing browser history: {str(e)}", exc_info=True)
\ No newline at end of file
+ logger.error(f"FATAL ERROR during application startup: {str(e)}", exc_info=True)
+ raise RuntimeError(f"Application startup failed: {e}") from e
+
+
+@app.on_event("shutdown")
+async def shutdown_event():
+ global crawler, scheduler
+ logger.info("Starting application shutdown...")
+
+ # Stop scheduler tasks gracefully if possible (implement cancellation in tasks if needed)
+ # For now, we just close resources
+
+ # Close scheduler resources
+ if scheduler and hasattr(scheduler, 'close'):
+ try:
+ logger.info("Closing scheduler resources...")
+ await scheduler.close() # Call the scheduler's close method
+ except Exception as e:
+ logger.error(f"Error closing scheduler: {e}", exc_info=True)
+
+    # Close the crawler if it exposes an async close method (guarded by hasattr,
+    # since crawl4ai has not historically required an explicit close)
+    if crawler and hasattr(crawler, 'aclose'):
+        try:
+            logger.info("Closing AsyncWebCrawler...")
+            await crawler.aclose()
+        except Exception as e:
+            logger.error(f"Error closing crawler: {e}", exc_info=True)
+
+
+ # Close database engine connections if necessary (usually handled automatically by SQLAlchemy)
+ # if engine and hasattr(engine, 'dispose'): # Check if using async engine that needs dispose
+ # await engine.dispose()
+
+ logger.info("Application shutdown complete.")
+
+
+# --- Include Routers ---
+app.include_router(history.router)
+app.include_router(bookmarks.router)
+app.include_router(api_config.router)
+app.include_router(websocket.router)
+app.include_router(ui.router)
+
+# Optional: Add a root endpoint for health check or basic info
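+# Quick smoke test once the server is running (assuming uvicorn's default host/port):
+#   curl http://127.0.0.1:8000/health
+#   -> {"status": "ok", "database_connection": "ok"}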
+@app.get("/health", tags=["service"])
+async def health_check():
+    # Liveness check plus a simple DB round-trip; extend with task-status checks as needed
+ db_ok = False
+ try:
+ with next(get_db()) as db:
+ db.execute("SELECT 1")
+ db_ok = True
+ except Exception:
+ db_ok = False
+
+ return {
+ "status": "ok",
+ "database_connection": "ok" if db_ok else "error",
+ # Add other checks as needed
+ }
\ No newline at end of file
diff --git a/app/page_reader.py b/app/page_reader.py
deleted file mode 100644
index 0edcf27..0000000
--- a/app/page_reader.py
+++ /dev/null
@@ -1,117 +0,0 @@
-import re
-from markdownify import markdownify as md
-from bs4 import BeautifulSoup
-from typing import Optional
-from urllib.parse import urlparse
-from .config import ReaderConfig
-from .logging_config import setup_logger
-from .database import SessionLocal
-
-# Setup logger for this module
-logger = setup_logger(__name__)
-
-# Patterns for cleaning
-SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
-STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
-META_PATTERN = r"<[ ]*meta.*?>"
-COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
-LINK_PATTERN = r"<[ ]*link.*?>"
-BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
-SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"