mirror of https://github.com/Zetaphor/browser-recall.git
synced 2025-12-06 02:19:37 +00:00

Refactor to use crawl4ai, uv

@@ -1 +1 @@
-3.10.6
+3.10.16
121  app/config.py
@@ -2,6 +2,10 @@ import yaml
 from pathlib import Path
 from typing import Set
 import fnmatch
+import os
+import logging
+
+logger = logging.getLogger(__name__)
 
 class Config:
     def __init__(self):
@@ -128,3 +132,120 @@ class ReaderConfig:
         return False
 
         return True
+
+DEFAULT_CONFIG_PATH = 'config/reader_config.yaml'
+USER_CONFIG_DIR = os.path.expanduser("~/.config/browser-recall")
+USER_CONFIG_PATH = os.path.join(USER_CONFIG_DIR, 'reader_config.yaml')
+
+class Config:
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if not cls._instance:
+            cls._instance = super(Config, cls).__new__(cls)
+            cls._instance._initialized = False
+        return cls._instance
+
+    def __init__(self, config_path=None):
+        if self._initialized:
+            return
+        self._initialized = True
+
+        self.config_path = self._determine_config_path(config_path)
+        self.config_data = self._load_config()
+        logger.info(f"Config initialized using: {self.config_path}")
+        # Pre-process excluded domains for faster lookup if needed,
+        # but direct iteration with fnmatch is often fine for moderate lists.
+        self.excluded_domains = self.config_data.get('excluded_domains', [])
+        # Ensure it's a list
+        if not isinstance(self.excluded_domains, list):
+            logger.warning(f"Excluded domains in config is not a list: {self.excluded_domains}. Ignoring.")
+            self.excluded_domains = []
+
+    def _determine_config_path(self, provided_path):
+        """Determine the correct config path to use."""
+        if provided_path and os.path.exists(provided_path):
+            return provided_path
+        if os.path.exists(USER_CONFIG_PATH):
+            return USER_CONFIG_PATH
+        if os.path.exists(DEFAULT_CONFIG_PATH):
+            return DEFAULT_CONFIG_PATH
+        logger.warning("No configuration file found at default or user locations. Using empty config.")
+        return None # Indicate no file was found
+
+    def _load_config(self):
+        """Loads the YAML configuration file."""
+        if not self.config_path:
+            return {} # Return empty dict if no config file path determined
+
+        try:
+            with open(self.config_path, 'r') as f:
+                return yaml.safe_load(f) or {} # Return empty dict if file is empty
+        except FileNotFoundError:
+            logger.warning(f"Configuration file not found at {self.config_path}. Using default settings.")
+            return {}
+        except yaml.YAMLError as e:
+            logger.error(f"Error parsing configuration file {self.config_path}: {e}")
+            return {} # Return empty dict on parsing error
+        except Exception as e:
+            logger.error(f"Unexpected error loading configuration {self.config_path}: {e}")
+            return {}
+
+    def get_config(self):
+        """Returns the loaded configuration data."""
+        return self.config_data
+
+    def reload_config(self):
+        """Reloads the configuration from the file."""
+        logger.info(f"Reloading configuration from: {self.config_path}")
+        self.config_data = self._load_config()
+        self.excluded_domains = self.config_data.get('excluded_domains', [])
+        if not isinstance(self.excluded_domains, list):
+            logger.warning(f"Excluded domains in reloaded config is not a list: {self.excluded_domains}. Ignoring.")
+            self.excluded_domains = []
+        logger.info("Configuration reloaded.")
+
+    def is_domain_ignored(self, domain: str) -> bool:
+        """
+        Checks if a given domain matches any pattern in the excluded_domains list.
+        Supports exact matches and wildcard (*) matching using fnmatch.
+        """
+        if not domain: # Ignore empty domains
+            return True
+        if not self.excluded_domains: # If list is empty, nothing is ignored
+            return False
+
+        # Normalize domain to lowercase for case-insensitive comparison
+        domain_lower = domain.lower()
+
+        for pattern in self.excluded_domains:
+            if not isinstance(pattern, str): # Skip non-string patterns
+                continue
+
+            # Normalize pattern to lowercase
+            pattern_lower = pattern.lower()
+
+            # Use fnmatch.fnmatch for wildcard support (*)
+            if fnmatch.fnmatch(domain_lower, pattern_lower):
+                # logger.debug(f"Domain '{domain}' ignored due to pattern '{pattern}'")
+                return True
+        return False
+
+    # --- Add methods to get specific config values safely ---
+    @property
+    def history_update_interval_seconds(self) -> int:
+        """Gets the history update interval, defaulting to 300."""
+        return self.config_data.get('history_update_interval_seconds', 300)
+
+    @property
+    def markdown_update_interval_seconds(self) -> int:
+        """Gets the markdown update interval, defaulting to 300."""
+        return self.config_data.get('markdown_update_interval_seconds', 300)
+
+    # Add other specific getters as needed
+    # Example:
+    # @property
+    # def some_other_setting(self) -> str:
+    #     return self.config_data.get('some_other_setting', 'default_value')
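A brief usage sketch of the new singleton Config added above. The attribute and method names come from the diff; the sample YAML patterns in the comments are illustrative assumptions, only the keys the class actually reads are taken from the code.

# usage sketch -- illustration only, not part of the commit
from app.config import Config

config = Config()              # __new__ hands back one shared instance
assert config is Config()      # every call yields the same object

# Example ~/.config/browser-recall/reader_config.yaml (illustrative):
#   excluded_domains:
#     - "localhost"
#     - "*.internal.example"
#   history_update_interval_seconds: 600

print(config.history_update_interval_seconds)             # 600 if set, otherwise the 300 default
print(config.is_domain_ignored("api.internal.example"))   # True via the fnmatch wildcard
print(config.is_domain_ignored("example.org"))            # False
config.reload_config()                                     # re-reads the YAML and refreshes excluded_domains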
686  app/main.py
@@ -1,493 +1,293 @@
|
|||||||
from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException
|
from fastapi import FastAPI, Depends
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
from datetime import datetime, timezone, timedelta
|
from datetime import datetime, timezone
|
||||||
from typing import List, Optional
|
from typing import Optional
|
||||||
import asyncio
|
import asyncio
|
||||||
from fastapi import WebSocketDisconnect
|
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
import pytz
|
|
||||||
from fastapi.middleware.cors import CORSMiddleware
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
import iso8601
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from sqlalchemy import text
|
|
||||||
from sqlalchemy.sql import text
|
|
||||||
from .logging_config import setup_logger
|
|
||||||
from fastapi.templating import Jinja2Templates
|
|
||||||
from fastapi.staticfiles import StaticFiles
|
from fastapi.staticfiles import StaticFiles
|
||||||
from fastapi import Request
|
|
||||||
import browser_history
|
import browser_history
|
||||||
|
from crawl4ai import AsyncWebCrawler
|
||||||
|
|
||||||
|
# Local imports
|
||||||
|
from .logging_config import setup_logger
|
||||||
from .database import (
|
from .database import (
|
||||||
get_db,
|
get_db,
|
||||||
HistoryEntry,
|
HistoryEntry,
|
||||||
Bookmark,
|
|
||||||
get_last_processed_timestamp,
|
get_last_processed_timestamp,
|
||||||
update_last_processed_timestamp,
|
update_last_processed_timestamp,
|
||||||
create_tables,
|
create_tables,
|
||||||
engine,
|
engine,
|
||||||
recreate_fts_tables
|
# recreate_fts_tables # Keep if needed, but often done manually or via migration tool
|
||||||
)
|
)
|
||||||
from .scheduler import HistoryScheduler
|
|
||||||
from .page_info import PageInfo
|
|
||||||
from .page_reader import PageReader
|
|
||||||
from .config import Config
|
from .config import Config
|
||||||
from sqlalchemy.ext.declarative import declarative_base
|
|
||||||
|
# Import Routers
|
||||||
|
from .routers import history, bookmarks, config as api_config, websocket, ui
|
||||||
|
|
||||||
logger = setup_logger(__name__)
|
logger = setup_logger(__name__)
|
||||||
|
|
||||||
app = FastAPI()
|
# --- Global Variables ---
|
||||||
scheduler = HistoryScheduler()
|
# These are accessed by other modules (like websocket router)
|
||||||
config = Config()
|
# Consider using app state or dependency injection for cleaner management if complexity grows
|
||||||
|
config_manager = Config() # Renamed to avoid conflict with router import
|
||||||
|
crawler: Optional[AsyncWebCrawler] = None
|
||||||
|
|
||||||
# Add CORS middleware to allow WebSocket connections
|
# Import scheduler *after* crawler is defined
|
||||||
|
from .scheduler import HistoryScheduler
|
||||||
|
scheduler: Optional[HistoryScheduler] = None # Now initialize scheduler variable
|
||||||
|
|
||||||
|
# --- FastAPI App Initialization ---
|
||||||
|
app = FastAPI(title="Browser History Search API")
|
||||||
|
|
||||||
|
# Add CORS middleware
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
CORSMiddleware,
|
CORSMiddleware,
|
||||||
allow_origins=["*"], # In production, specify your domains
|
allow_origins=["*"], # Adjust in production
|
||||||
allow_credentials=True,
|
allow_credentials=True,
|
||||||
allow_methods=["*"],
|
allow_methods=["*"],
|
||||||
allow_headers=["*"],
|
allow_headers=["*"],
|
||||||
)
|
)
|
||||||
|
|
||||||
templates = Jinja2Templates(directory="app/templates")
|
# Mount static files and templates
|
||||||
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
app.mount("/static", StaticFiles(directory="app/static"), name="static")
|
||||||
|
# Note: Templates are used within the ui router now, no need for global instance here unless used elsewhere
|
||||||
|
|
||||||
@app.on_event("startup")
|
# --- Helper Function (Initial Sync) ---
|
||||||
async def startup_event():
|
|
||||||
logger.info("Starting application")
|
|
||||||
|
|
||||||
try:
|
|
||||||
# First create the base tables
|
|
||||||
logger.info("Creating base tables...")
|
|
||||||
create_tables()
|
|
||||||
|
|
||||||
# # Drop and recreate FTS tables
|
|
||||||
# logger.info("Recreating FTS tables...")
|
|
||||||
# with engine.connect() as conn:
|
|
||||||
# # First check if the main history table exists
|
|
||||||
# result = conn.execute(text(
|
|
||||||
# "SELECT name FROM sqlite_master WHERE type='table' AND name='history'"
|
|
||||||
# )).fetchone()
|
|
||||||
|
|
||||||
# if not result:
|
|
||||||
# logger.info("Main history table doesn't exist yet, creating tables...")
|
|
||||||
# Base.metadata.create_all(bind=engine)
|
|
||||||
|
|
||||||
# # Now recreate FTS tables
|
|
||||||
# logger.info("Dropping and recreating FTS tables...")
|
|
||||||
# recreate_fts_tables()
|
|
||||||
|
|
||||||
# logger.info("FTS tables recreation completed")
|
|
||||||
|
|
||||||
# Initial history and bookmark fetch
|
|
||||||
logger.info("Processing initial browser history...")
|
|
||||||
process_browser_history()
|
|
||||||
|
|
||||||
logger.info("Updating bookmarks...")
|
|
||||||
await scheduler.update_bookmarks()
|
|
||||||
|
|
||||||
# Start the background tasks
|
|
||||||
logger.info("Starting background tasks...")
|
|
||||||
asyncio.create_task(scheduler.update_history())
|
|
||||||
|
|
||||||
logger.info("Startup completed successfully")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error during startup: {str(e)}", exc_info=True)
|
|
||||||
raise
|
|
||||||
|
|
||||||
def serialize_history_entry(entry, include_content: bool = False):
|
|
||||||
"""Serialize a HistoryEntry object to a dictionary"""
|
|
||||||
# Handle both ORM objects and raw SQL results
|
|
||||||
if hasattr(entry, '_mapping'): # Raw SQL result
|
|
||||||
result = {
|
|
||||||
"id": entry.id,
|
|
||||||
"url": entry.url,
|
|
||||||
"title": entry.title,
|
|
||||||
"visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
|
|
||||||
"domain": entry.domain,
|
|
||||||
}
|
|
||||||
else: # ORM object
|
|
||||||
result = {
|
|
||||||
"id": entry.id,
|
|
||||||
"url": entry.url,
|
|
||||||
"title": entry.title,
|
|
||||||
"visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
|
|
||||||
"domain": entry.domain,
|
|
||||||
}
|
|
||||||
|
|
||||||
if include_content:
|
|
||||||
result["markdown_content"] = entry.markdown_content
|
|
||||||
return result
|
|
||||||
|
|
||||||
def serialize_bookmark(bookmark):
|
|
||||||
"""Serialize a Bookmark object to a dictionary"""
|
|
||||||
return {
|
|
||||||
"id": bookmark.id,
|
|
||||||
"url": bookmark.url,
|
|
||||||
"title": bookmark.title,
|
|
||||||
"added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
|
|
||||||
"folder": bookmark.folder,
|
|
||||||
"domain": bookmark.domain,
|
|
||||||
}
|
|
||||||
|
|
||||||
@app.get("/history/search")
|
|
||||||
async def search_history(
|
|
||||||
query: Optional[str] = Query(None),
|
|
||||||
domain: Optional[str] = Query(None),
|
|
||||||
start_date: Optional[str] = Query(None),
|
|
||||||
end_date: Optional[str] = Query(None),
|
|
||||||
include_content: bool = Query(False),
|
|
||||||
db: Session = Depends(get_db)
|
|
||||||
):
|
|
||||||
"""Search history using FTS5"""
|
|
||||||
try:
|
|
||||||
if query:
|
|
||||||
# Build the FTS query
|
|
||||||
fts_conditions = [f'title:{query}* OR markdown_content:{query}*']
|
|
||||||
params = {'query': query}
|
|
||||||
|
|
||||||
if domain:
|
|
||||||
fts_conditions.append(f'domain:"{domain}"')
|
|
||||||
|
|
||||||
fts_query = ' AND '.join(fts_conditions)
|
|
||||||
|
|
||||||
# Build the SQL query
|
|
||||||
sql = """
|
|
||||||
SELECT
|
|
||||||
h.*,
|
|
||||||
bm25(history_fts) as rank,
|
|
||||||
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
|
|
||||||
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
|
|
||||||
FROM history_fts
|
|
||||||
JOIN history h ON history_fts.rowid = h.id
|
|
||||||
WHERE history_fts MATCH :fts_query
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Add date filters if provided
|
|
||||||
if start_date:
|
|
||||||
sql += " AND h.visit_time >= :start_date"
|
|
||||||
params['start_date'] = start_date
|
|
||||||
if end_date:
|
|
||||||
sql += " AND h.visit_time <= :end_date"
|
|
||||||
params['end_date'] = end_date
|
|
||||||
|
|
||||||
sql += " ORDER BY rank, h.visit_time DESC LIMIT 100"
|
|
||||||
|
|
||||||
params['fts_query'] = fts_query
|
|
||||||
|
|
||||||
results = db.execute(text(sql), params).fetchall()
|
|
||||||
return [serialize_history_entry(row, include_content) for row in results]
|
|
||||||
|
|
||||||
else:
|
|
||||||
# Handle non-search queries
|
|
||||||
query = db.query(HistoryEntry)
|
|
||||||
|
|
||||||
if domain:
|
|
||||||
query = query.filter(HistoryEntry.domain == domain)
|
|
||||||
if start_date:
|
|
||||||
query = query.filter(HistoryEntry.visit_time >= start_date)
|
|
||||||
if end_date:
|
|
||||||
query = query.filter(HistoryEntry.visit_time <= end_date)
|
|
||||||
|
|
||||||
entries = query.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
|
|
||||||
return [serialize_history_entry(entry, include_content) for entry in entries]
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Search error: {str(e)}", exc_info=True)
|
|
||||||
raise HTTPException(
|
|
||||||
status_code=500,
|
|
||||||
detail={"message": "Search operation failed", "error": str(e)}
|
|
||||||
)
|
|
||||||
|
|
||||||
@app.get("/bookmarks/search")
|
|
||||||
async def search_bookmarks(
|
|
||||||
domain: Optional[str] = Query(None),
|
|
||||||
folder: Optional[str] = Query(None),
|
|
||||||
search_term: Optional[str] = Query(None),
|
|
||||||
db: Session = Depends(get_db)
|
|
||||||
):
|
|
||||||
"""Search bookmarks with optimized queries"""
|
|
||||||
try:
|
|
||||||
# Build query efficiently
|
|
||||||
query = db.query(Bookmark)
|
|
||||||
|
|
||||||
# Apply filters using index-optimized queries
|
|
||||||
if domain:
|
|
||||||
query = query.filter(Bookmark.domain == domain)
|
|
||||||
|
|
||||||
if folder:
|
|
||||||
query = query.filter(Bookmark.folder == folder)
|
|
||||||
|
|
||||||
if search_term:
|
|
||||||
# Use LIKE with index hint for title search
|
|
||||||
search_pattern = f"%{search_term}%"
|
|
||||||
query = query.filter(
|
|
||||||
Bookmark.title.ilike(search_pattern)
|
|
||||||
).with_hint(
|
|
||||||
Bookmark,
|
|
||||||
'INDEXED BY ix_bookmarks_title',
|
|
||||||
'sqlite'
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add ordering and limit for better performance
|
|
||||||
bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
|
|
||||||
|
|
||||||
return [serialize_bookmark(bookmark) for bookmark in bookmarks]
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Bookmark search error: {e}")
|
|
||||||
raise HTTPException(status_code=500, detail="Search operation failed")
|
|
||||||
|
|
||||||
# Add new endpoint for advanced full-text search
|
|
||||||
@app.get("/history/search/advanced")
|
|
||||||
async def advanced_history_search(
|
|
||||||
query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
|
|
||||||
include_content: bool = Query(False),
|
|
||||||
db: Session = Depends(get_db)
|
|
||||||
):
|
|
||||||
"""Advanced full-text search using SQLite FTS5 features"""
|
|
||||||
try:
|
|
||||||
# Use raw SQL for advanced FTS query
|
|
||||||
fts_query = """
|
|
||||||
SELECT h.*, rank
|
|
||||||
FROM history h
|
|
||||||
INNER JOIN history_fts f ON h.id = f.rowid
|
|
||||||
WHERE history_fts MATCH :query
|
|
||||||
ORDER BY rank
|
|
||||||
LIMIT 1000
|
|
||||||
"""
|
|
||||||
|
|
||||||
results = db.execute(text(fts_query), {'query': query}).all()
|
|
||||||
|
|
||||||
# Convert results to HistoryEntry objects
|
|
||||||
entries = [
|
|
||||||
serialize_history_entry(
|
|
||||||
HistoryEntry(
|
|
||||||
id=row.id,
|
|
||||||
url=row.url,
|
|
||||||
title=row.title,
|
|
||||||
visit_time=row.visit_time,
|
|
||||||
domain=row.domain,
|
|
||||||
markdown_content=row.markdown_content if include_content else None
|
|
||||||
),
|
|
||||||
include_content
|
|
||||||
)
|
|
||||||
for row in results
|
|
||||||
]
|
|
||||||
|
|
||||||
return entries
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Advanced search error: {e}")
|
|
||||||
raise HTTPException(status_code=500, detail="Advanced search operation failed")
|
|
||||||
|
|
||||||
@app.websocket("/ws")
|
|
||||||
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
|
|
||||||
logger.info("New WebSocket connection established")
|
|
||||||
page_reader = PageReader()
|
|
||||||
await websocket.accept()
|
|
||||||
try:
|
|
||||||
while True:
|
|
||||||
data = await websocket.receive_json()
|
|
||||||
|
|
||||||
# Parse the URL and check if domain should be ignored
|
|
||||||
domain = urlparse(data['url']).netloc
|
|
||||||
if config.is_domain_ignored(domain):
|
|
||||||
logger.info(f"Ignoring domain: {domain}")
|
|
||||||
await websocket.send_json({
|
|
||||||
"status": "ignored",
|
|
||||||
"message": f"Domain {domain} is in ignore list"
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
logger.info(f"Processing page: {data['url']}")
|
|
||||||
timestamp = iso8601.parse_date(data['timestamp'])
|
|
||||||
|
|
||||||
# Check if we already have a recent entry for this URL
|
|
||||||
existing_entry = db.query(HistoryEntry).filter(
|
|
||||||
HistoryEntry.url == data['url'],
|
|
||||||
HistoryEntry.visit_time >= timestamp - timedelta(minutes=5)
|
|
||||||
).first()
|
|
||||||
|
|
||||||
if existing_entry:
|
|
||||||
print(f"Recent entry exists for URL: {data['url']}")
|
|
||||||
await websocket.send_json({
|
|
||||||
"status": "skipped",
|
|
||||||
"message": "Recent entry exists"
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
page_info = PageInfo(
|
|
||||||
url=data['url'],
|
|
||||||
html=data['html'],
|
|
||||||
timestamp=timestamp
|
|
||||||
)
|
|
||||||
|
|
||||||
# Debug HTML content
|
|
||||||
print(f"HTML content length before processing: {len(page_info.html)}")
|
|
||||||
|
|
||||||
# Extract title
|
|
||||||
soup = BeautifulSoup(page_info.html, 'html.parser')
|
|
||||||
title = soup.title.string if soup.title else ''
|
|
||||||
print(f"Extracted title: {title}")
|
|
||||||
|
|
||||||
# Debug markdown conversion
|
|
||||||
print("Starting markdown conversion...")
|
|
||||||
cleaned_html = page_reader.clean_html(page_info.html)
|
|
||||||
print(f"Cleaned HTML length: {len(cleaned_html)}")
|
|
||||||
|
|
||||||
markdown_content = page_reader.html_to_markdown(page_info.html)
|
|
||||||
print(f"Markdown conversion complete. Content length: {len(markdown_content) if markdown_content else 0}")
|
|
||||||
|
|
||||||
if markdown_content:
|
|
||||||
print("First 100 chars of markdown:", markdown_content[:100])
|
|
||||||
else:
|
|
||||||
print("No markdown content generated")
|
|
||||||
|
|
||||||
if not title and not markdown_content:
|
|
||||||
print(f"No content extracted from: {page_info.url}")
|
|
||||||
await websocket.send_json({
|
|
||||||
"status": "skipped",
|
|
||||||
"message": "No content extracted"
|
|
||||||
})
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Create history entry
|
|
||||||
history_entry = HistoryEntry(
|
|
||||||
url=page_info.url,
|
|
||||||
title=title,
|
|
||||||
visit_time=page_info.timestamp,
|
|
||||||
domain=domain,
|
|
||||||
markdown_content=markdown_content,
|
|
||||||
last_content_update=datetime.now(timezone.utc)
|
|
||||||
)
|
|
||||||
|
|
||||||
# Debug database operation
|
|
||||||
print(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}")
|
|
||||||
|
|
||||||
# Use bulk operations for better performance
|
|
||||||
db.add(history_entry)
|
|
||||||
|
|
||||||
try:
|
|
||||||
db.commit()
|
|
||||||
print(f"Successfully saved entry for: {page_info.url}")
|
|
||||||
print(f"Verify markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}")
|
|
||||||
await websocket.send_json({
|
|
||||||
"status": "success",
|
|
||||||
"message": f"Processed page: {page_info.url}"
|
|
||||||
})
|
|
||||||
except Exception as e:
|
|
||||||
db.rollback()
|
|
||||||
print(f"Error saving entry: {e}")
|
|
||||||
await websocket.send_json({
|
|
||||||
"status": "error",
|
|
||||||
"message": "Database error"
|
|
||||||
})
|
|
||||||
|
|
||||||
except WebSocketDisconnect:
|
|
||||||
logger.info("Client disconnected")
|
|
||||||
except Exception as e:
|
|
||||||
logger.error("Error in WebSocket handler", exc_info=True)
|
|
||||||
finally:
|
|
||||||
await page_reader.close()
|
|
||||||
|
|
||||||
@app.get("/config/ignored-domains")
|
|
||||||
async def get_ignored_domains():
|
|
||||||
"""Get list of ignored domain patterns"""
|
|
||||||
return {"ignored_domains": config.config.get('ignored_domains', [])}
|
|
||||||
|
|
||||||
@app.post("/config/ignored-domains")
|
|
||||||
async def add_ignored_domain(pattern: str):
|
|
||||||
"""Add a new domain pattern to ignored list"""
|
|
||||||
config.add_ignored_domain(pattern)
|
|
||||||
return {"status": "success", "message": f"Added pattern: {pattern}"}
|
|
||||||
|
|
||||||
@app.delete("/config/ignored-domains/{pattern}")
|
|
||||||
async def remove_ignored_domain(pattern: str):
|
|
||||||
"""Remove a domain pattern from ignored list"""
|
|
||||||
config.remove_ignored_domain(pattern)
|
|
||||||
return {"status": "success", "message": f"Removed pattern: {pattern}"}
|
|
||||||
|
|
||||||
@app.get("/")
|
|
||||||
async def home(request: Request, db: Session = Depends(get_db)):
|
|
||||||
# Get recent history entries
|
|
||||||
entries = db.query(HistoryEntry)\
|
|
||||||
.order_by(HistoryEntry.visit_time.desc())\
|
|
||||||
.limit(50)\
|
|
||||||
.all()
|
|
||||||
return templates.TemplateResponse(
|
|
||||||
"index.html",
|
|
||||||
{"request": request, "entries": entries}
|
|
||||||
)
|
|
||||||
|
|
||||||
@app.get("/search")
|
|
||||||
async def search_page(request: Request):
|
|
||||||
return templates.TemplateResponse(
|
|
||||||
"search.html",
|
|
||||||
{"request": request}
|
|
||||||
)
|
|
||||||
|
|
||||||
@app.get("/bookmarks")
|
|
||||||
async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
|
|
||||||
bookmarks = db.query(Bookmark)\
|
|
||||||
.order_by(Bookmark.added_time.desc())\
|
|
||||||
.limit(50)\
|
|
||||||
.all()
|
|
||||||
return templates.TemplateResponse(
|
|
||||||
"bookmarks.html",
|
|
||||||
{"request": request, "bookmarks": bookmarks}
|
|
||||||
)
|
|
||||||
|
|
||||||
def process_browser_history():
|
def process_browser_history():
|
||||||
|
"""Fetches and stores new history entries from browser_history library (Initial Sync)."""
|
||||||
try:
|
try:
|
||||||
logger.info("Starting browser history processing")
|
logger.info("Starting browser history processing (initial sync)")
|
||||||
outputs = browser_history.get_history()
|
outputs = browser_history.get_history()
|
||||||
history_list = outputs.histories # This is a list of tuples (timestamp, url, title)
|
# browser_history returns platform specific History object, get histories list
|
||||||
logger.info(f"Found {len(history_list)} total history items")
|
history_list = []
|
||||||
|
if hasattr(outputs, 'histories') and outputs.histories:
|
||||||
|
history_list = outputs.histories # List of (datetime, url, title)
|
||||||
|
else:
|
||||||
|
logger.warning("Could not retrieve histories list from browser_history output.")
|
||||||
|
return # Exit if no history list found
|
||||||
|
|
||||||
current_timestamp = int(datetime.now().timestamp())
|
logger.info(f"Found {len(history_list)} total history items from browser_history library")
|
||||||
source_key = "browser_history" # Single source since we get combined history
|
|
||||||
last_timestamp = get_last_processed_timestamp(source_key)
|
|
||||||
|
|
||||||
logger.info(f"Last processed timestamp: {last_timestamp}")
|
current_timestamp_dt = datetime.now(timezone.utc)
|
||||||
|
current_timestamp = int(current_timestamp_dt.timestamp()) # Use timezone-aware timestamp
|
||||||
|
source_key = "browser_history_sync" # Differentiate from scheduler source
|
||||||
|
last_timestamp = get_last_processed_timestamp(source_key) or 0 # Ensure it's 0 if None
|
||||||
|
|
||||||
# Filter for only new entries
|
logger.info(f"Last processed timestamp for initial sync '{source_key}': {last_timestamp}")
|
||||||
new_entries = [
|
|
||||||
entry for entry in history_list
|
|
||||||
if entry[0].timestamp() > last_timestamp
|
|
||||||
]
|
|
||||||
|
|
||||||
logger.info(f"Found {len(new_entries)} new entries")
|
new_entries = []
|
||||||
|
processed_urls_times = set() # Avoid duplicates within the batch
|
||||||
|
|
||||||
|
for entry in history_list:
|
||||||
|
# Basic validation of entry structure
|
||||||
|
if not isinstance(entry, (tuple, list)) or len(entry) < 2:
|
||||||
|
logger.warning(f"Skipping malformed history entry: {entry}")
|
||||||
|
continue
|
||||||
|
timestamp, url = entry[0], entry[1]
|
||||||
|
title = entry[2] if len(entry) > 2 else "" # Handle optional title
|
||||||
|
|
||||||
|
if not url or not timestamp:
|
||||||
|
logger.warning(f"Skipping entry with missing URL or timestamp: Title='{title}'")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Ensure timestamp is datetime object
|
||||||
|
if not isinstance(timestamp, datetime):
|
||||||
|
logger.warning(f"Skipping entry with non-datetime timestamp ({type(timestamp)}): {url}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Normalize timestamp (Assume local if naive, convert to UTC)
|
||||||
|
if timestamp.tzinfo is None or timestamp.tzinfo.utcoffset(timestamp) is None:
|
||||||
|
try:
|
||||||
|
timestamp_aware = timestamp.astimezone() # Make aware using system local
|
||||||
|
except Exception as tz_err:
|
||||||
|
logger.warning(f"Could not determine local timezone for naive timestamp {timestamp}. Assuming UTC. Error: {tz_err}")
|
||||||
|
timestamp_aware = timestamp.replace(tzinfo=timezone.utc) # Fallback to UTC
|
||||||
|
else:
|
||||||
|
timestamp_aware = timestamp
|
||||||
|
timestamp_utc = timestamp_aware.astimezone(timezone.utc)
|
||||||
|
|
||||||
|
|
||||||
|
# Filter for only new entries based on normalized UTC timestamp
|
||||||
|
if timestamp_utc.timestamp() > last_timestamp:
|
||||||
|
entry_key = (url, timestamp_utc.timestamp())
|
||||||
|
if entry_key in processed_urls_times:
|
||||||
|
continue # Skip duplicate within this batch
|
||||||
|
|
||||||
|
new_entries.append((timestamp_utc, url, title))
|
||||||
|
processed_urls_times.add(entry_key)
|
||||||
|
|
||||||
|
logger.info(f"Found {len(new_entries)} new entries for initial sync after filtering")
|
||||||
|
|
||||||
if new_entries:
|
if new_entries:
|
||||||
for timestamp, url, title in new_entries:
|
added_count = 0
|
||||||
logger.info(f"Processing entry: {timestamp} - {url}")
|
skipped_ignored = 0
|
||||||
|
# Use context manager for session
|
||||||
|
with next(get_db()) as db:
|
||||||
|
try:
|
||||||
|
for timestamp_utc, url, title in new_entries:
|
||||||
domain = urlparse(url).netloc
|
domain = urlparse(url).netloc
|
||||||
if config.is_domain_ignored(domain):
|
if config_manager.is_domain_ignored(domain):
|
||||||
logger.debug(f"Skipping ignored domain: {domain}")
|
# logger.debug(f"Skipping ignored domain during initial sync: {domain}")
|
||||||
|
skipped_ignored += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Create history entry
|
# Optional: Check if entry already exists more robustly
|
||||||
db = next(get_db())
|
# existing = db.query(HistoryEntry.id).filter(HistoryEntry.url == url, HistoryEntry.visit_time == timestamp_utc).first()
|
||||||
try:
|
# if existing:
|
||||||
|
# continue
|
||||||
|
|
||||||
history_entry = HistoryEntry(
|
history_entry = HistoryEntry(
|
||||||
url=url,
|
url=url,
|
||||||
title=title,
|
title=title or "", # Ensure title is not None
|
||||||
visit_time=timestamp,
|
visit_time=timestamp_utc,
|
||||||
domain=domain
|
domain=domain
|
||||||
|
# Note: No markdown content here, only basic history
|
||||||
)
|
)
|
||||||
db.add(history_entry)
|
db.add(history_entry)
|
||||||
|
added_count += 1
|
||||||
|
|
||||||
|
if added_count > 0:
|
||||||
db.commit()
|
db.commit()
|
||||||
except Exception as e:
|
logger.info(f"Committed {added_count} new history entries from initial sync.")
|
||||||
logger.error(f"Error storing history item: {str(e)}")
|
# Update the last processed timestamp only if successful commit
|
||||||
db.rollback()
|
|
||||||
finally:
|
|
||||||
db.close()
|
|
||||||
|
|
||||||
# Update the last processed timestamp
|
|
||||||
update_last_processed_timestamp(source_key, current_timestamp)
|
update_last_processed_timestamp(source_key, current_timestamp)
|
||||||
logger.info(f"Updated timestamp to {current_timestamp}")
|
logger.info(f"Updated initial sync timestamp for '{source_key}' to {current_timestamp}")
|
||||||
|
else:
|
||||||
|
logger.info("No new unique entries to commit during initial sync.")
|
||||||
|
# Update timestamp even if nothing new added, to mark sync time
|
||||||
|
update_last_processed_timestamp(source_key, current_timestamp)
|
||||||
|
logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
|
||||||
|
|
||||||
logger.info(f"Processed {len(new_entries)} new items")
|
|
||||||
|
if skipped_ignored > 0:
|
||||||
|
logger.info(f"Skipped {skipped_ignored} entries due to ignored domains during initial sync.")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Error processing browser history: {str(e)}", exc_info=True)
|
logger.error(f"Error storing history item during initial sync: {str(e)}", exc_info=True)
|
||||||
|
db.rollback()
|
||||||
|
else:
|
||||||
|
logger.info("No new history entries found during initial sync.")
|
||||||
|
# Update timestamp even if nothing new found, to mark sync time
|
||||||
|
update_last_processed_timestamp(source_key, current_timestamp)
|
||||||
|
logger.info(f"Updated initial sync timestamp check for '{source_key}' to {current_timestamp}")
|
||||||
|
|
||||||
|
|
||||||
|
except ImportError:
|
||||||
|
logger.warning("`browser_history` library not found or import failed. Skipping initial sync.")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error processing browser history during initial sync: {str(e)}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Startup and Shutdown Events ---
|
||||||
|
@app.on_event("startup")
|
||||||
|
async def startup_event():
|
||||||
|
global crawler, scheduler # Allow modification of globals
|
||||||
|
logger.info("Starting application initialization...")
|
||||||
|
|
||||||
|
try:
|
||||||
|
# 1. Ensure base tables exist
|
||||||
|
logger.info("Ensuring base tables exist...")
|
||||||
|
create_tables()
|
||||||
|
|
||||||
|
# 2. Initialize the crawler
|
||||||
|
logger.info("Initializing AsyncWebCrawler...")
|
||||||
|
if crawler is None:
|
||||||
|
crawler = AsyncWebCrawler()
|
||||||
|
logger.info("AsyncWebCrawler initialized.")
|
||||||
|
|
||||||
|
# 3. Initialize the Scheduler *after* the crawler
|
||||||
|
logger.info("Initializing HistoryScheduler...")
|
||||||
|
if scheduler is None:
|
||||||
|
scheduler = HistoryScheduler(crawler=crawler) # Pass crawler instance
|
||||||
|
logger.info("HistoryScheduler initialized.")
|
||||||
|
|
||||||
|
# 4. Perform initial history sync from browser_history library
|
||||||
|
logger.info("Performing initial browser history sync...")
|
||||||
|
process_browser_history() # Sync history not processed before
|
||||||
|
|
||||||
|
# 5. Perform initial bookmark sync (using scheduler's method)
|
||||||
|
# Run in background to avoid blocking startup if it takes long
|
||||||
|
logger.info("Starting initial bookmark sync task...")
|
||||||
|
asyncio.create_task(scheduler.update_bookmarks())
|
||||||
|
|
||||||
|
# 6. Start background tasks (scheduler for ongoing updates)
|
||||||
|
logger.info("Starting background history update task...")
|
||||||
|
asyncio.create_task(scheduler.update_history())
|
||||||
|
|
||||||
|
# --- Markdown Update Tasks ---
|
||||||
|
# 7a. Trigger ONE initial batch processing run in the background
|
||||||
|
logger.info("Starting initial markdown processing batch task...")
|
||||||
|
asyncio.create_task(scheduler._process_markdown_batch()) # Run one batch now
|
||||||
|
|
||||||
|
# 7b. Start the PERIODIC background markdown update task
|
||||||
|
logger.info("Starting periodic background markdown update task...")
|
||||||
|
# Use the renamed method for the loop
|
||||||
|
asyncio.create_task(scheduler.update_missing_markdown_periodically())
|
||||||
|
# --- End Markdown Update Tasks ---
|
||||||
|
|
||||||
|
|
||||||
|
logger.info("Application startup sequence initiated. Background tasks running.")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"FATAL ERROR during application startup: {str(e)}", exc_info=True)
|
||||||
|
raise RuntimeError(f"Application startup failed: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
|
@app.on_event("shutdown")
|
||||||
|
async def shutdown_event():
|
||||||
|
global crawler, scheduler
|
||||||
|
logger.info("Starting application shutdown...")
|
||||||
|
|
||||||
|
# Stop scheduler tasks gracefully if possible (implement cancellation in tasks if needed)
|
||||||
|
# For now, we just close resources
|
||||||
|
|
||||||
|
# Close scheduler resources
|
||||||
|
if scheduler and hasattr(scheduler, 'close'):
|
||||||
|
try:
|
||||||
|
logger.info("Closing scheduler resources...")
|
||||||
|
await scheduler.close() # Call the scheduler's close method
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error closing scheduler: {e}", exc_info=True)
|
||||||
|
|
||||||
|
# Close crawler if needed (check crawl4ai docs for explicit close method)
|
||||||
|
# Based on previous code, seems no explicit close needed, but keep check just in case
|
||||||
|
if crawler and hasattr(crawler, 'aclose'):
|
||||||
|
try:
|
||||||
|
logger.info("Closing AsyncWebCrawler...")
|
||||||
|
# await crawler.aclose() # Example if an async close exists
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error closing crawler: {e}", exc_info=True)
|
||||||
|
|
||||||
|
|
||||||
|
# Close database engine connections if necessary (usually handled automatically by SQLAlchemy)
|
||||||
|
# if engine and hasattr(engine, 'dispose'): # Check if using async engine that needs dispose
|
||||||
|
# await engine.dispose()
|
||||||
|
|
||||||
|
logger.info("Application shutdown complete.")
|
||||||
|
|
||||||
|
|
||||||
|
# --- Include Routers ---
|
||||||
|
app.include_router(history.router)
|
||||||
|
app.include_router(bookmarks.router)
|
||||||
|
app.include_router(api_config.router)
|
||||||
|
app.include_router(websocket.router)
|
||||||
|
app.include_router(ui.router)
|
||||||
|
|
||||||
|
# Optional: Add a root endpoint for health check or basic info
|
||||||
|
@app.get("/health", tags=["service"])
|
||||||
|
async def health_check():
|
||||||
|
# Extended health check could verify DB connection or task status
|
||||||
|
db_ok = False
|
||||||
|
try:
|
||||||
|
with next(get_db()) as db:
|
||||||
|
db.execute("SELECT 1")
|
||||||
|
db_ok = True
|
||||||
|
except Exception:
|
||||||
|
db_ok = False
|
||||||
|
|
||||||
|
return {
|
||||||
|
"status": "ok",
|
||||||
|
"database_connection": "ok" if db_ok else "error",
|
||||||
|
# Add other checks as needed
|
||||||
|
}
|
||||||
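The refactor in app/main.py above replaces the old BeautifulSoup/PageReader markdown pipeline with crawl4ai: one AsyncWebCrawler is created at startup and shared with the scheduler and the websocket router. A minimal sketch of that call pattern, mirroring the calls made in this diff (arun() per URL, then .markdown and the metadata title); it is an illustration under those assumptions, not the exact handler, and the example URL is made up.

# crawl sketch -- illustration of the crawl4ai call pattern used by this commit
import asyncio
from crawl4ai import AsyncWebCrawler

async def fetch_markdown(crawler: AsyncWebCrawler, url: str) -> tuple[str, str]:
    result = await crawler.arun(url=url)                   # same call the websocket handler makes
    if not result:
        return "", ""
    markdown = result.markdown or ""                       # markdown rendering of the page
    title = getattr(result.metadata, "title", "") or ""    # title lookup as written in the handler
    return title, markdown

async def main():
    crawler = AsyncWebCrawler()                            # created once at startup in app/main.py
    title, markdown = await fetch_markdown(crawler, "https://example.org")
    print(title, len(markdown))

# asyncio.run(main())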
@@ -1,117 +0,0 @@
|
|||||||
import re
|
|
||||||
from markdownify import markdownify as md
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from typing import Optional
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
from .config import ReaderConfig
|
|
||||||
from .logging_config import setup_logger
|
|
||||||
from .database import SessionLocal
|
|
||||||
|
|
||||||
# Setup logger for this module
|
|
||||||
logger = setup_logger(__name__)
|
|
||||||
|
|
||||||
# Patterns for cleaning
|
|
||||||
SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
|
|
||||||
STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
|
|
||||||
META_PATTERN = r"<[ ]*meta.*?>"
|
|
||||||
COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
|
|
||||||
LINK_PATTERN = r"<[ ]*link.*?>"
|
|
||||||
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
|
|
||||||
SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
|
|
||||||
|
|
||||||
class PageReader:
|
|
||||||
def __init__(self):
|
|
||||||
self.config = ReaderConfig()
|
|
||||||
logger.info("PageReader initialized")
|
|
||||||
|
|
||||||
def clean_html(self, html: str) -> str:
|
|
||||||
"""Clean HTML by removing unwanted elements and patterns."""
|
|
||||||
if not html:
|
|
||||||
logger.warning("Received empty HTML to clean")
|
|
||||||
return ""
|
|
||||||
|
|
||||||
logger.debug(f"Cleaning HTML of length: {len(html)}")
|
|
||||||
# First use regex to remove problematic patterns
|
|
||||||
html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
|
|
||||||
html = re.sub(BASE64_IMG_PATTERN, "", html)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Use BeautifulSoup to remove additional elements we want to strip
|
|
||||||
soup = BeautifulSoup(html, 'html.parser')
|
|
||||||
|
|
||||||
# Remove unwanted elements
|
|
||||||
elements_to_remove = [
|
|
||||||
'canvas', 'img', 'picture', 'audio', 'video',
|
|
||||||
'iframe', 'embed', 'object', 'param', 'track',
|
|
||||||
'map', 'area', 'source'
|
|
||||||
]
|
|
||||||
|
|
||||||
for element in elements_to_remove:
|
|
||||||
removed = len(soup.find_all(element))
|
|
||||||
if removed:
|
|
||||||
logger.debug(f"Removed {removed} {element} elements")
|
|
||||||
for tag in soup.find_all(element):
|
|
||||||
tag.decompose()
|
|
||||||
|
|
||||||
return str(soup)
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error cleaning HTML: {e}", exc_info=True)
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def clean_whitespace(self, text: str) -> str:
|
|
||||||
"""Clean excessive whitespace from text."""
|
|
||||||
if not text:
|
|
||||||
return ""
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Replace 3 or more newlines with 2 newlines
|
|
||||||
cleaned = re.sub(r'\n{3,}', '\n\n', text)
|
|
||||||
# Remove trailing whitespace from each line
|
|
||||||
cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
|
|
||||||
return cleaned.strip()
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error cleaning whitespace: {e}")
|
|
||||||
return ""
|
|
||||||
|
|
||||||
def html_to_markdown(self, html: str) -> Optional[str]:
|
|
||||||
"""Convert HTML to markdown."""
|
|
||||||
try:
|
|
||||||
logger.info("Starting HTML to Markdown conversion")
|
|
||||||
logger.debug(f"Input HTML length: {len(html)}")
|
|
||||||
|
|
||||||
cleaned_html = self.clean_html(html)
|
|
||||||
logger.debug(f"Cleaned HTML length: {len(cleaned_html)}")
|
|
||||||
|
|
||||||
if not cleaned_html:
|
|
||||||
logger.warning("No cleaned HTML content")
|
|
||||||
return None
|
|
||||||
|
|
||||||
markdown = self.clean_whitespace(md(cleaned_html,
|
|
||||||
heading_style="ATX",
|
|
||||||
bullets="-",
|
|
||||||
autolinks=True,
|
|
||||||
strip=['form'],
|
|
||||||
escape_asterisks=True,
|
|
||||||
escape_underscores=True))
|
|
||||||
|
|
||||||
logger.debug(f"Generated markdown length: {len(markdown) if markdown else 0}")
|
|
||||||
|
|
||||||
if not markdown or markdown.isspace():
|
|
||||||
logger.warning("Markdown is empty or whitespace only")
|
|
||||||
return None
|
|
||||||
|
|
||||||
return markdown
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error("Error converting to markdown", exc_info=True)
|
|
||||||
return None
|
|
||||||
|
|
||||||
async def close(self):
|
|
||||||
"""Cleanup resources"""
|
|
||||||
logger.info("Closing PageReader")
|
|
||||||
pass # No need to close DB connection anymore
|
|
||||||
47  app/routers/bookmarks.py  Normal file
@@ -0,0 +1,47 @@
+from fastapi import APIRouter, Depends, Query, HTTPException
+from sqlalchemy.orm import Session
+from typing import List, Optional
+
+from ..database import get_db, Bookmark
+from ..utils import serialize_bookmark
+from ..logging_config import setup_logger
+
+logger = setup_logger(__name__)
+router = APIRouter(prefix="/bookmarks", tags=["bookmarks"])
+
+@router.get("/search")
+async def search_bookmarks(
+    domain: Optional[str] = Query(None),
+    folder: Optional[str] = Query(None),
+    search_term: Optional[str] = Query(None),
+    db: Session = Depends(get_db)
+):
+    """Search bookmarks with optimized queries"""
+    try:
+        # Build query efficiently
+        query = db.query(Bookmark)
+
+        # Apply filters using index-optimized queries
+        if domain:
+            query = query.filter(Bookmark.domain == domain)
+
+        if folder:
+            query = query.filter(Bookmark.folder == folder)
+
+        if search_term:
+            # Use LIKE for title search (consider FTS for bookmarks if needed)
+            search_pattern = f"%{search_term}%"
+            query = query.filter(Bookmark.title.ilike(search_pattern))
+            # Removed index hint as SQLAlchemy/SQLite usually handles this well with LIKE
+
+        # Add ordering and limit for better performance
+        bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
+
+        return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+
+    except Exception as e:
+        logger.error(f"Bookmark search error: {e}", exc_info=True)
+        raise HTTPException(
+            status_code=500,
+            detail={"message": "Bookmark search operation failed", "error": str(e)}
+        )
43  app/routers/config.py  Normal file
@@ -0,0 +1,43 @@
+from fastapi import APIRouter, Depends, HTTPException
+from typing import List
+
+from ..config import Config
+from ..logging_config import setup_logger
+
+logger = setup_logger(__name__)
+router = APIRouter(prefix="/config", tags=["config"])
+
+# Assuming config is a singleton or easily accessible
+# If not, you might need to use Depends or app state
+config = Config()
+
+@router.get("/ignored-domains")
+async def get_ignored_domains():
+    """Get list of ignored domain patterns"""
+    try:
+        return {"ignored_domains": config.config.get('ignored_domains', [])}
+    except Exception as e:
+        logger.error(f"Error getting ignored domains: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail="Failed to retrieve ignored domains")
+
+
+@router.post("/ignored-domains")
+async def add_ignored_domain(pattern: str):
+    """Add a new domain pattern to ignored list"""
+    try:
+        config.add_ignored_domain(pattern)
+        return {"status": "success", "message": f"Added pattern: {pattern}"}
+    except Exception as e:
+        logger.error(f"Error adding ignored domain '{pattern}': {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail="Failed to add ignored domain")
+
+
+@router.delete("/ignored-domains/{pattern}")
+async def remove_ignored_domain(pattern: str):
+    """Remove a domain pattern from ignored list"""
+    try:
+        config.remove_ignored_domain(pattern)
+        return {"status": "success", "message": f"Removed pattern: {pattern}"}
+    except Exception as e:
+        logger.error(f"Error removing ignored domain '{pattern}': {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail="Failed to remove ignored domain")
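A hedged sketch of exercising the new /config endpoints with FastAPI's TestClient. The app.main import matches this repo's layout, but actually running it depends on the shared Config object providing the config.config mapping and the add_ignored_domain / remove_ignored_domain methods the router calls, which this diff does not show being defined; the pattern value is made up.

# test sketch -- illustration only
from fastapi.testclient import TestClient
from app.main import app   # the FastAPI instance created in app/main.py

client = TestClient(app)   # note: instantiating this runs the startup event

print(client.get("/config/ignored-domains").json())
# -> {"ignored_domains": [...]}

print(client.post("/config/ignored-domains", params={"pattern": "*.ads.example"}).json())
# -> {"status": "success", "message": "Added pattern: *.ads.example"}

print(client.delete("/config/ignored-domains/*.ads.example").json())
# -> {"status": "success", "message": "Removed pattern: *.ads.example"}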
132  app/routers/history.py  Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
from fastapi import APIRouter, Depends, Query, HTTPException
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
from sqlalchemy import text
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
|
from ..database import get_db, HistoryEntry
|
||||||
|
from ..utils import serialize_history_entry
|
||||||
|
from ..logging_config import setup_logger
|
||||||
|
|
||||||
|
logger = setup_logger(__name__)
|
||||||
|
router = APIRouter(prefix="/history", tags=["history"])
|
||||||
|
|
||||||
|
@router.get("/search")
|
||||||
|
async def search_history(
|
||||||
|
query: Optional[str] = Query(None),
|
||||||
|
domain: Optional[str] = Query(None),
|
||||||
|
start_date: Optional[str] = Query(None),
|
||||||
|
end_date: Optional[str] = Query(None),
|
||||||
|
include_content: bool = Query(False),
|
||||||
|
db: Session = Depends(get_db)
|
||||||
|
):
|
||||||
|
"""Search history using FTS5"""
|
||||||
|
try:
|
||||||
|
if query:
|
||||||
|
# Build the FTS query
|
||||||
|
# Basic query sanitization/escaping might be needed depending on FTS syntax usage
|
||||||
|
# For simple term search, this is okay. For complex FTS syntax, more care is needed.
|
||||||
|
fts_conditions = []
|
||||||
|
params = {}
|
||||||
|
|
||||||
|
# Handle different query parts (title, content, domain)
|
||||||
|
# Example: "term1 title:term2 domain:example.com"
|
||||||
|
# This requires more sophisticated parsing. For now, assume simple query applies to title/content.
|
||||||
|
# A safer approach for user input:
|
||||||
|
sanitized_query = query.replace('"', '""') # Basic FTS escaping for quotes
|
||||||
|
fts_match_expr = f'(title : "{sanitized_query}"* OR markdown_content : "{sanitized_query}"*)'
|
||||||
|
params['fts_query'] = fts_match_expr
|
||||||
|
|
||||||
|
if domain:
|
||||||
|
# Add domain filtering directly in FTS if possible and indexed
|
||||||
|
# Assuming 'domain' is an indexed column in FTS table
|
||||||
|
# params['fts_query'] += f' AND domain : "{domain}"' # Adjust FTS syntax if needed
|
||||||
|
# Or filter after FTS search if domain isn't in FTS index efficiently
|
||||||
|
pass # Domain filtering will be added later if needed
|
||||||
|
|
||||||
|
# Build the SQL query
|
||||||
|
sql = """
|
||||||
|
SELECT
|
||||||
|
h.*,
|
||||||
|
bm25(history_fts) as rank,
|
||||||
|
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
|
||||||
|
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
|
||||||
|
FROM history_fts
|
||||||
|
JOIN history h ON history_fts.rowid = h.id
|
||||||
|
WHERE history_fts MATCH :fts_query
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Add domain filter as a regular WHERE clause if not in FTS MATCH
|
||||||
|
if domain:
|
||||||
|
sql += " AND h.domain = :domain"
|
||||||
|
params['domain'] = domain
|
||||||
|
|
||||||
|
# Add date filters if provided
|
||||||
|
if start_date:
|
||||||
|
sql += " AND h.visit_time >= :start_date"
|
||||||
|
params['start_date'] = start_date
|
||||||
|
if end_date:
|
||||||
|
sql += " AND h.visit_time <= :end_date"
|
||||||
|
params['end_date'] = end_date
|
||||||
|
|
||||||
|
sql += " ORDER BY rank DESC, h.visit_time DESC LIMIT 100" # Rank usually descends
|
||||||
|
|
||||||
|
results = db.execute(text(sql), params).fetchall()
|
||||||
|
# Use the updated serializer that handles potential highlight/rank fields
|
||||||
|
return [serialize_history_entry(row, include_content) for row in results]
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Handle non-search queries (basic filtering)
|
||||||
|
query_builder = db.query(HistoryEntry)
|
||||||
|
|
||||||
|
if domain:
|
||||||
|
query_builder = query_builder.filter(HistoryEntry.domain == domain)
|
||||||
|
if start_date:
|
||||||
|
query_builder = query_builder.filter(HistoryEntry.visit_time >= start_date)
|
||||||
|
if end_date:
|
||||||
|
query_builder = query_builder.filter(HistoryEntry.visit_time <= end_date)
|
||||||
|
|
||||||
|
entries = query_builder.order_by(HistoryEntry.visit_time.desc()).limit(100).all()
|
||||||
|
return [serialize_history_entry(entry, include_content) for entry in entries]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Search error: {str(e)}", exc_info=True)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail={"message": "Search operation failed", "error": str(e)}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/search/advanced")
|
||||||
|
async def advanced_history_search(
|
||||||
|
query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
|
||||||
|
include_content: bool = Query(False),
|
||||||
|
db: Session = Depends(get_db)
|
||||||
|
):
|
||||||
|
"""Advanced full-text search using SQLite FTS5 features"""
|
||||||
|
try:
|
||||||
|
# Use raw SQL for advanced FTS query
|
||||||
|
# Add rank and highlights here as well
|
||||||
|
fts_query = """
|
||||||
|
SELECT
|
||||||
|
h.*,
|
||||||
|
bm25(history_fts) as rank,
|
||||||
|
highlight(history_fts, 0, '<mark>', '</mark>') as title_highlight,
|
||||||
|
highlight(history_fts, 1, '<mark>', '</mark>') as content_highlight
|
||||||
|
FROM history_fts
|
||||||
|
JOIN history h ON history_fts.rowid = h.id
|
||||||
|
WHERE history_fts MATCH :query
|
||||||
|
ORDER BY rank DESC, h.visit_time DESC
|
||||||
|
LIMIT 1000
|
||||||
|
"""
|
||||||
|
|
||||||
|
results = db.execute(text(fts_query), {'query': query}).fetchall()
|
||||||
|
|
||||||
|
# Use the updated serializer
|
||||||
|
return [serialize_history_entry(row, include_content) for row in results]
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Advanced search error: {e}", exc_info=True)
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=500,
|
||||||
|
detail={"message": "Advanced search operation failed", "error": str(e)}
|
||||||
|
)
|
||||||
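The /history/search and /history/search/advanced endpoints above build on SQLite FTS5: bm25() for ranking and highlight() for match markup. A standalone sketch of that query shape, assuming a SQLite build with FTS5 compiled in; the table and column names follow the SQL in the diff, the sample row is made up. bm25() returns smaller values for better matches, so this sketch orders ascending.

# fts5 sketch -- illustration only
import sqlite3

con = sqlite3.connect(":memory:")
con.executescript("""
    CREATE TABLE history (id INTEGER PRIMARY KEY, url TEXT, title TEXT,
                          markdown_content TEXT, domain TEXT, visit_time TEXT);
    CREATE VIRTUAL TABLE history_fts USING fts5(title, markdown_content,
                                                content='history', content_rowid='id');
    INSERT INTO history (url, title, markdown_content, domain, visit_time)
    VALUES ('https://example.org', 'FTS5 notes', 'How bm25 ranking works', 'example.org', '2025-01-01');
    INSERT INTO history_fts (rowid, title, markdown_content)
    SELECT id, title, markdown_content FROM history;
""")

rows = con.execute("""
    SELECT h.*, bm25(history_fts) AS rank,
           highlight(history_fts, 1, '<mark>', '</mark>') AS content_highlight
    FROM history_fts
    JOIN history h ON history_fts.rowid = h.id
    WHERE history_fts MATCH ?
    ORDER BY rank          -- ascending: lower bm25 score means a better match
""", ("bm25",)).fetchall()
print(rows)   # one row, with 'How <mark>bm25</mark> ranking works' highlighted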
52  app/routers/ui.py  Normal file
@@ -0,0 +1,52 @@
+from fastapi import APIRouter, Depends, Request
+from fastapi.templating import Jinja2Templates
+from sqlalchemy.orm import Session
+
+from ..database import get_db, HistoryEntry, Bookmark
+from ..logging_config import setup_logger
+
+logger = setup_logger(__name__)
+router = APIRouter(tags=["ui"])
+templates = Jinja2Templates(directory="app/templates")
+
+@router.get("/")
+async def home(request: Request, db: Session = Depends(get_db)):
+    try:
+        # Get recent history entries
+        entries = db.query(HistoryEntry)\
+            .order_by(HistoryEntry.visit_time.desc())\
+            .limit(50)\
+            .all()
+        return templates.TemplateResponse(
+            "index.html",
+            {"request": request, "entries": entries}
+        )
+    except Exception as e:
+        logger.error(f"Error loading home page: {e}", exc_info=True)
+        # Optionally return an error template
+        return templates.TemplateResponse("error.html", {"request": request, "detail": "Could not load history"})
+
+
+@router.get("/search")
+async def search_page(request: Request):
+    return templates.TemplateResponse(
+        "search.html",
+        {"request": request}
+    )
+
+
+@router.get("/bookmarks")
+async def bookmarks_page(request: Request, db: Session = Depends(get_db)):
+    try:
+        bookmarks = db.query(Bookmark)\
+            .order_by(Bookmark.added_time.desc())\
+            .limit(50)\
+            .all()
+        return templates.TemplateResponse(
+            "bookmarks.html",
+            {"request": request, "bookmarks": bookmarks}
+        )
+    except Exception as e:
+        logger.error(f"Error loading bookmarks page: {e}", exc_info=True)
+        # Optionally return an error template
+        return templates.TemplateResponse("error.html", {"request": request, "detail": "Could not load bookmarks"})
175  app/routers/websocket.py  Normal file
@@ -0,0 +1,175 @@
|
|||||||
|
import asyncio
|
||||||
|
from fastapi import APIRouter, WebSocket, WebSocketDisconnect, Depends, HTTPException
|
||||||
|
from sqlalchemy.orm import Session
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
import iso8601
|
||||||
|
|
||||||
|
# Import necessary components from other modules
|
||||||
|
from .. import main as app_main # To access global crawler instance
|
||||||
|
from ..database import get_db, HistoryEntry
|
||||||
|
from ..config import Config
|
||||||
|
from ..logging_config import setup_logger
|
||||||
|
|
||||||
|
logger = setup_logger(__name__)
|
||||||
|
router = APIRouter(tags=["websocket"])
|
||||||
|
config = Config() # Assuming config is okay as a separate instance here
|
||||||
|
|
||||||
|
@router.websocket("/ws")
|
||||||
|
async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
|
||||||
|
# Access the global crawler instance from main.py
|
||||||
|
crawler = app_main.crawler
|
||||||
|
if not crawler:
|
||||||
|
logger.error("Crawler not initialized!")
|
||||||
|
await websocket.close(code=1011) # Internal Server Error
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info("New WebSocket connection established")
|
||||||
|
await websocket.accept()
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
data = await websocket.receive_json()
|
||||||
|
|
||||||
|
# Validate incoming data structure (basic check)
|
||||||
|
if 'url' not in data or 'timestamp' not in data:
|
||||||
|
logger.warning("Received invalid WebSocket message format.")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "error",
|
||||||
|
"message": "Invalid message format. 'url' and 'timestamp' required."
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
url = data['url']
|
||||||
|
try:
|
||||||
|
timestamp = iso8601.parse_date(data['timestamp'])
|
||||||
|
except iso8601.ParseError:
|
||||||
|
logger.warning(f"Received invalid timestamp format: {data['timestamp']}")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "error",
|
||||||
|
"message": f"Invalid timestamp format: {data['timestamp']}"
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Parse the URL and check if domain should be ignored
|
||||||
|
try:
|
||||||
|
domain = urlparse(url).netloc
|
||||||
|
if not domain: # Handle invalid URLs
|
||||||
|
raise ValueError("Could not parse domain from URL")
|
||||||
|
except ValueError as e:
|
||||||
|
logger.warning(f"Could not parse URL: {url}. Error: {e}")
|
||||||
|
await websocket.send_json({"status": "error", "message": f"Invalid URL: {url}"})
|
||||||
|
continue
|
||||||
|
|
||||||
|
if config.is_domain_ignored(domain):
|
||||||
|
logger.info(f"Ignoring domain: {domain} for URL: {url}")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "ignored",
|
||||||
|
"message": f"Domain {domain} is in ignore list"
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
logger.info(f"Processing page via WebSocket: {url}")
|
||||||
|
|
||||||
|
# Check if we already have a recent entry for this URL
|
||||||
|
# Make timestamp timezone-aware (assuming UTC if naive)
|
||||||
|
if timestamp.tzinfo is None:
|
||||||
|
timestamp = timestamp.replace(tzinfo=timezone.utc)
|
||||||
|
else:
|
||||||
|
timestamp = timestamp.astimezone(timezone.utc)
|
||||||
|
|
||||||
|
recent_threshold = timestamp - timedelta(minutes=5)
|
||||||
|
existing_entry = db.query(HistoryEntry.id).filter(
|
||||||
|
HistoryEntry.url == url,
|
||||||
|
HistoryEntry.visit_time >= recent_threshold
|
||||||
|
).first() # Only fetch ID for efficiency
|
||||||
|
|
||||||
|
if existing_entry:
|
||||||
|
logger.info(f"Recent entry exists for URL: {url}")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "skipped",
|
||||||
|
"message": "Recent entry exists"
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
# --- Start crawl4ai processing ---
|
||||||
|
logger.info(f"Processing page with crawl4ai: {url}")
|
||||||
|
markdown_content = None
|
||||||
|
title = ''
|
||||||
|
try:
|
||||||
|
# Use the global crawler instance
|
||||||
|
crawl_result = await crawler.arun(url=url)
|
||||||
|
if crawl_result:
|
||||||
|
markdown_content = crawl_result.markdown
|
||||||
|
# Attempt to get title from metadata, fallback to empty string
|
||||||
|
title = getattr(crawl_result.metadata, 'title', '') or '' # Ensure title is string
|
||||||
|
if not title:
|
||||||
|
logger.warning(f"Could not extract title for {url} using crawl4ai.")
|
||||||
|
logger.info(f"crawl4ai processing complete. Markdown length: {len(markdown_content) if markdown_content else 0}, Title: '{title}'")
|
||||||
|
else:
|
||||||
|
logger.warning(f"crawl4ai returned None for URL: {url}")
|
||||||
|
markdown_content = "" # Ensure it's not None
|
||||||
|
title = ""
|
||||||
|
|
||||||
|
except Exception as crawl_error:
|
||||||
|
logger.error(f"crawl4ai failed for URL {url}: {crawl_error}", exc_info=True)
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "error",
|
||||||
|
"message": f"Failed to crawl page content: {str(crawl_error)}"
|
||||||
|
})
|
||||||
|
continue # Skip to next message
|
||||||
|
# --- End crawl4ai processing ---
|
||||||
|
|
||||||
|
# Only proceed if we got some content or at least a title
|
||||||
|
if not title and not markdown_content:
|
||||||
|
logger.info(f"No title or content extracted by crawl4ai from: {url}")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "skipped",
|
||||||
|
"message": "No title or content extracted by crawl4ai"
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Create history entry using data from crawl4ai
|
||||||
|
history_entry = HistoryEntry(
|
||||||
|
url=url,
|
||||||
|
title=title, # Use title from crawl4ai
|
||||||
|
visit_time=timestamp, # Use the parsed, timezone-aware timestamp
|
||||||
|
domain=domain,
|
||||||
|
markdown_content=markdown_content, # Use markdown from crawl4ai
|
||||||
|
last_content_update=datetime.now(timezone.utc)
|
||||||
|
)
|
||||||
|
|
||||||
|
logger.debug(f"Attempting to save entry for {url} with markdown length: {len(markdown_content) if markdown_content else 0}")
|
||||||
|
|
||||||
|
db.add(history_entry)
|
||||||
|
try:
|
||||||
|
db.commit()
|
||||||
|
logger.info(f"Successfully saved entry for: {url}")
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "success",
|
||||||
|
"message": f"Processed page: {url}"
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
db.rollback()
|
||||||
|
logger.error(f"Error saving entry for {url}: {e}", exc_info=True)
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "error",
|
||||||
|
"message": "Database error occurred while saving."
|
||||||
|
})
|
||||||
|
|
||||||
|
except WebSocketDisconnect:
|
||||||
|
logger.info("WebSocket client disconnected")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Unhandled error in WebSocket handler: {e}", exc_info=True)
|
||||||
|
# Attempt to inform client before closing (might fail if connection is already broken)
|
||||||
|
try:
|
||||||
|
await websocket.send_json({
|
||||||
|
"status": "error",
|
||||||
|
"message": "An internal server error occurred."
|
||||||
|
})
|
||||||
|
except Exception:
|
||||||
|
pass # Ignore if sending fails
|
||||||
|
# Ensure connection is closed on server error
|
||||||
|
try:
|
||||||
|
await websocket.close(code=1011) # Internal Server Error
|
||||||
|
except Exception:
|
||||||
|
pass # Ignore if closing fails
|
||||||
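
For reference, a minimal client sketch for this endpoint. The payload shape ("url" plus an ISO 8601 "timestamp") mirrors what the handler above validates; the host, port, and path are assumptions about how the app is served, not part of this commit.

# Hedged client sketch for the /ws endpoint above; ws://localhost:8000/ws is an assumption.
import asyncio
import json
from datetime import datetime, timezone

import websockets  # already listed as a dependency in pyproject.toml


async def report_visit(url: str) -> None:
    async with websockets.connect("ws://localhost:8000/ws") as ws:
        # Send one page-visit message in the format the handler expects
        await ws.send(json.dumps({
            "url": url,
            "timestamp": datetime.now(timezone.utc).isoformat(),
        }))
        reply = json.loads(await ws.recv())
        print(reply["status"], "-", reply["message"])


if __name__ == "__main__":
    asyncio.run(report_visit("https://example.com/some-article"))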

374 app/scheduler.py
@@ -1,142 +1,386 @@
from datetime import datetime, timedelta, timezone
import asyncio
from sqlalchemy import or_, update
from .database import HistoryEntry, Bookmark, get_last_processed_timestamp, update_last_processed_timestamp
from .browser import BrowserHistoryCollector
from .config import Config
from .database import get_db
import urllib.parse
import logging
from crawl4ai import AsyncWebCrawler
from typing import Optional

logger = logging.getLogger(__name__)


class HistoryScheduler:
    def __init__(self, crawler: AsyncWebCrawler):
        self.browser_collector = BrowserHistoryCollector()
        self.last_history_update = None
        self.content_update_interval = timedelta(hours=24)  # Update content daily
        self.config = Config()
        self.db_lock = asyncio.Lock()
        self.crawler = crawler

    def _normalize_datetime(self, dt: datetime) -> Optional[datetime]:
        """Convert datetime to UTC if it has timezone, or make it timezone-aware (UTC) if it doesn't"""
        if dt is None:
            return None

        # If datetime is naive (no timezone), assume it's local and convert to UTC
        if dt.tzinfo is None or dt.tzinfo.utcoffset(dt) is None:
            # Assume local timezone if naive, then convert to UTC
            # This might need adjustment based on where the naive datetime originates
            # If browser_history always returns naive UTC, use: dt.replace(tzinfo=timezone.utc)
            # If browser_history returns naive local time:
            dt = dt.astimezone()  # Make timezone-aware using system's local timezone
            return dt.astimezone(timezone.utc)  # Convert to UTC

        # If datetime already has timezone, convert to UTC
        return dt.astimezone(timezone.utc)

    async def update_bookmarks(self):
        """Update bookmarks from browsers"""
        try:
            # Use timezone-aware current time
            current_timestamp_dt = datetime.now(timezone.utc)
            current_timestamp = int(current_timestamp_dt.timestamp())
            source_key = "browser_bookmarks"
            # Ensure last_timestamp is 0 if None
            last_timestamp = get_last_processed_timestamp(source_key) or 0

            logger.info(f"Fetching bookmarks. Last processed timestamp (UTC epoch): {last_timestamp}")
            bookmarks = self.browser_collector.fetch_bookmarks()
            logger.info(f"Found {len(bookmarks)} total bookmarks")

            new_bookmarks = []
            skipped_ignored = 0
            processed_urls = set()  # Avoid processing duplicate bookmark URLs within the same batch

            for added_time, url, title, folder in bookmarks:
                if not url or url in processed_urls:  # Skip empty or duplicate URLs in this batch
                    continue

                # Normalize timestamp *before* comparison
                normalized_added_time = self._normalize_datetime(added_time)
                if normalized_added_time is None:
                    logger.warning(f"Skipping bookmark with invalid timestamp: {url} - {title}")
                    continue

                # Compare timestamps after normalization
                if normalized_added_time.timestamp() > last_timestamp:
                    domain = urllib.parse.urlparse(url).netloc
                    if self.config.is_domain_ignored(domain):
                        # logger.debug(f"Skipping ignored domain for bookmark: {domain}")
                        skipped_ignored += 1
                        continue

                    new_bookmarks.append((normalized_added_time, url, title, folder, domain))
                    processed_urls.add(url)  # Mark URL as processed for this batch

            logger.info(f"Found {len(new_bookmarks)} new bookmarks to process after filtering.")
            if skipped_ignored > 0:
                logger.info(f"Skipped {skipped_ignored} bookmarks due to ignored domains.")

            if new_bookmarks:
                async with self.db_lock:
                    # Use context manager for session
                    with next(get_db()) as db:
                        added_count = 0
                        try:
                            for norm_added_time, url, title, folder, domain in new_bookmarks:
                                # Optional: Check if bookmark already exists (by URL)
                                # existing = db.query(Bookmark.id).filter(Bookmark.url == url).first()
                                # if existing:
                                #     logger.debug(f"Bookmark already exists: {url}")
                                #     continue

                                bookmark = Bookmark(
                                    url=url,
                                    title=title or "",  # Ensure title is not None
                                    added_time=norm_added_time,
                                    folder=folder or "",  # Ensure folder is not None
                                    domain=domain
                                )
                                db.add(bookmark)
                                added_count += 1

                            if added_count > 0:
                                db.commit()
                                logger.info(f"Successfully committed {added_count} new bookmarks.")
                                # Update timestamp only if new bookmarks were added
                                update_last_processed_timestamp(source_key, current_timestamp)
                                logger.info(f"Updated last processed bookmark timestamp for '{source_key}' to {current_timestamp}")
                            else:
                                logger.info("No new unique bookmarks to add in this batch.")
                                # Optionally update timestamp even if no *new* bookmarks were added,
                                # to signify the check was performed up to 'current_timestamp'.
                                # update_last_processed_timestamp(source_key, current_timestamp)
                                # logger.info(f"Updated last processed bookmark timestamp check for '{source_key}' to {current_timestamp}")

                        except Exception as e:
                            logger.error(f"Error committing bookmarks: {str(e)}", exc_info=True)
                            db.rollback()
            else:
                logger.info("No new bookmarks found since last check.")
                # Update timestamp to indicate the check was performed
                update_last_processed_timestamp(source_key, current_timestamp)
                logger.info(f"Updated last processed bookmark timestamp check for '{source_key}' to {current_timestamp}")

        except Exception as e:
            logger.error(f"Error updating bookmarks: {str(e)}", exc_info=True)

    async def update_history(self):
        """Background task to update history periodically"""
        # Initial sleep to allow startup tasks (like initial sync) to potentially finish first
        await asyncio.sleep(10)
        while True:
            try:
                # Use timezone-aware current time
                current_timestamp_dt = datetime.now(timezone.utc)
                current_timestamp = int(current_timestamp_dt.timestamp())
                source_key = "browser_history_scheduler"  # Use a different key than initial sync
                # Ensure last_timestamp is 0 if None
                last_timestamp = get_last_processed_timestamp(source_key) or 0

                logger.info(f"Scheduler: Fetching history. Last processed timestamp (UTC epoch): {last_timestamp}")
                history_entries = self.browser_collector.fetch_history()
                logger.info(f"Scheduler: Found {len(history_entries)} total history entries from browser.")

                new_entries = []
                skipped_ignored = 0
                processed_urls_times = set()  # Avoid duplicates within the batch (url, timestamp)

                for visit_time, url, title in history_entries:
                    # Basic validation
                    if not url or not visit_time:
                        logger.warning(f"Scheduler: Skipping entry with missing URL or timestamp: {title}")
                        continue

                    # Normalize timestamp *before* comparison
                    normalized_visit_time = self._normalize_datetime(visit_time)
                    if normalized_visit_time is None:
                        logger.warning(f"Scheduler: Skipping history with invalid timestamp: {url} - {title}")
                        continue

                    # Compare timestamps after normalization
                    if normalized_visit_time.timestamp() > last_timestamp:
                        entry_key = (url, normalized_visit_time.timestamp())
                        if entry_key in processed_urls_times:
                            continue  # Skip duplicate within this batch

                        domain = urllib.parse.urlparse(url).netloc
                        if self.config.is_domain_ignored(domain):
                            # logger.debug(f"Scheduler: Skipping ignored domain: {domain}")
                            skipped_ignored += 1
                            continue

                        new_entries.append((normalized_visit_time, url, title, domain))
                        processed_urls_times.add(entry_key)

                logger.info(f"Scheduler: Found {len(new_entries)} new history entries to process after filtering.")
                if skipped_ignored > 0:
                    logger.info(f"Scheduler: Skipped {skipped_ignored} history entries due to ignored domains.")

                if new_entries:
                    async with self.db_lock:
                        # Use context manager for session
                        with next(get_db()) as db:
                            added_count = 0
                            try:
                                for norm_visit_time, url, title, domain in new_entries:
                                    # Optional: More robust check if entry already exists
                                    # existing = db.query(HistoryEntry.id).filter(
                                    #     HistoryEntry.url == url,
                                    #     HistoryEntry.visit_time == norm_visit_time
                                    # ).first()
                                    # if existing:
                                    #     logger.debug(f"Scheduler: History entry already exists: {url} at {norm_visit_time}")
                                    #     continue

                                    history_entry = HistoryEntry(
                                        url=url,
                                        title=title or "",  # Ensure title is not None
                                        visit_time=norm_visit_time,
                                        domain=domain
                                        # markdown_content is initially NULL
                                    )
                                    db.add(history_entry)
                                    added_count += 1

                                if added_count > 0:
                                    db.commit()
                                    logger.info(f"Scheduler: Successfully committed {added_count} new history entries.")
                                    # Update timestamp only if new entries were added
                                    update_last_processed_timestamp(source_key, current_timestamp)
                                    logger.info(f"Scheduler: Updated last processed history timestamp for '{source_key}' to {current_timestamp}")
                                else:
                                    logger.info("Scheduler: No new unique history entries to add in this batch.")
                                    # Optionally update timestamp even if no *new* entries were added
                                    # update_last_processed_timestamp(source_key, current_timestamp)
                                    # logger.info(f"Scheduler: Updated last processed history timestamp check for '{source_key}' to {current_timestamp}")

                            except Exception as e:
                                logger.error(f"Scheduler: Error committing history: {str(e)}", exc_info=True)
                                db.rollback()
                else:
                    logger.info("Scheduler: No new history entries found since last check.")
                    # Update timestamp to indicate the check was performed
                    update_last_processed_timestamp(source_key, current_timestamp)
                    logger.info(f"Scheduler: Updated last processed history timestamp check for '{source_key}' to {current_timestamp}")

            except Exception as e:
                logger.error(f"Scheduler: Error in update_history loop: {str(e)}", exc_info=True)

            # --- Access config value using property ---
            try:
                # Use direct attribute access via the @property
                wait_time = self.config.history_update_interval_seconds
            except Exception as config_err:
                logger.error(f"Scheduler (History): Error accessing config for wait time, using default 300s. Error: {config_err}")
                wait_time = 300
            # --- End Access ---

            logger.debug(f"Scheduler (History): Sleeping for {wait_time} seconds.")
            await asyncio.sleep(wait_time)  # Use the obtained wait_time

    async def _process_markdown_batch(self):
        """Fetches and processes one batch (up to 10) of history entries needing markdown."""
        entries_to_process = []
        try:
            # --- Query for entries (inside DB lock/session) ---
            async with self.db_lock:
                with next(get_db()) as db:
                    # Find up to 10 entries where markdown_content is NULL or empty string
                    entries_to_process = db.query(HistoryEntry).filter(
                        or_(HistoryEntry.markdown_content == None, HistoryEntry.markdown_content == '')
                    ).order_by(HistoryEntry.visit_time.asc()).limit(10).all()

                    if entries_to_process:
                        logger.info(f"Markdown Processor: Found {len(entries_to_process)} entries to process in this batch.")
                        for entry in entries_to_process:
                            db.expunge(entry)  # Detach before async operations
                    else:
                        logger.info("Markdown Processor: No history entries found needing markdown update in this batch.")
                        return  # Nothing to do in this batch

            # --- Crawling and Updating (outside the DB lock/session) ---
            processed_count = 0
            skipped_ignored = 0
            for entry in entries_to_process:
                markdown_content = None
                crawl_success = False
                should_update_db = False

                # --- ADD DOMAIN CHECK ---
                try:
                    # +++ Add Debugging Lines +++
                    logger.debug(f"Debugging urllib.parse type: {type(urllib.parse)}")
                    logger.debug(f"Is 'urlparse' in urllib.parse? {'urlparse' in dir(urllib.parse)}")
                    # +++ End Debugging Lines +++

                    domain = urllib.parse.urlparse(entry.url).netloc
                    if self.config.is_domain_ignored(domain):
                        logger.debug(f"Markdown Processor: Skipping ignored domain: {domain} for URL: {entry.url} (ID={entry.id})")
                        skipped_ignored += 1
                        continue
                except Exception as parse_err:
                    logger.warning(f"Markdown Processor: Error parsing URL to get domain: {entry.url} (ID={entry.id}). Type={type(parse_err).__name__} Error: {parse_err}. Skipping entry.")
                    continue
                # --- END DOMAIN CHECK ---

                try:
                    logger.info(f"Markdown Processor: Crawling URL: {entry.url} (ID={entry.id})")
                    if not self.crawler:
                        logger.error("Markdown Processor: Crawler not initialized!")
                        break  # Stop processing this batch if crawler is missing

                    result = await self.crawler.arun(url=entry.url)

                    if result and result.markdown:
                        markdown_content = result.markdown
                        crawl_success = True
                        logger.info(f"Markdown Processor: Successfully crawled and got markdown for ID={entry.id}.")
                    else:
                        logger.warning(f"Markdown Processor: Crawling completed but no markdown content found for ID={entry.id}, URL={entry.url}")
                        markdown_content = ""  # Mark as processed without content
                        crawl_success = True

                    should_update_db = True

                except Exception as crawl_error:
                    logger.error(f"Markdown Processor: Error crawling URL {entry.url} (ID={entry.id}) Type={type(crawl_error).__name__}: {crawl_error}", exc_info=False)
                    should_update_db = False  # Don't update DB on crawl error

                # --- Update DB for this specific entry ---
                if should_update_db:
                    try:
                        async with self.db_lock:
                            with next(get_db()) as db_update:
                                stmt = (
                                    update(HistoryEntry)
                                    .where(HistoryEntry.id == entry.id)
                                    .values(markdown_content=markdown_content)
                                )
                                result_proxy = db_update.execute(stmt)
                                if result_proxy.rowcount > 0:
                                    db_update.commit()
                                    # Adjust log message based on whether it was skipped or processed
                                    if markdown_content == "" and crawl_success and not result.markdown:  # Check if marked empty due to no content
                                        logger.info(f"Markdown Processor: Marked entry as processed (no content found) for ID={entry.id}.")
                                    elif crawl_success:
                                        logger.info(f"Markdown Processor: Successfully updated markdown status for ID={entry.id}.")

                                    # Only increment processed_count if actual content was added or marked empty after crawl
                                    if markdown_content is not None:  # Includes actual markdown or empty string marker
                                        processed_count += 1
                                else:
                                    logger.warning(f"Markdown Processor: Could not find entry ID={entry.id} to update markdown status (rowcount 0).")
                                    db_update.rollback()
                    except Exception as db_update_error:
                        logger.error(f"Markdown Processor: Error updating database for ID={entry.id}: {db_update_error}", exc_info=True)

            log_suffix = f"Updated {processed_count}"
            if skipped_ignored > 0:
                log_suffix += f", Skipped {skipped_ignored} (ignored domain)"
            log_suffix += f" out of {len(entries_to_process)} entries in this batch."
            logger.info(f"Markdown Processor: Finished processing batch. {log_suffix}")

        except Exception as e:
            logger.error(f"Markdown Processor: Error processing markdown batch: {str(e)}", exc_info=True)

    async def update_missing_markdown_periodically(self):
        """Periodically triggers the processing of batches of history entries needing markdown."""
        # Initial slight delay to ensure startup tasks settle
        await asyncio.sleep(15)
        logger.info("Starting periodic markdown update task...")
        while True:
            await self._process_markdown_batch()  # Process one batch

            # Wait before checking for the next batch
            # --- Access config value using property ---
            try:
                # Use direct attribute access via the @property
                wait_time = self.config.markdown_update_interval_seconds
            except Exception as config_err:
                logger.error(f"Periodic Markdown Updater: Error accessing config for wait time, using default 300s. Error: {config_err}")
                wait_time = 300
            # --- End Access ---

            logger.debug(f"Periodic Markdown Updater: Sleeping for {wait_time} seconds before next batch.")
            await asyncio.sleep(wait_time)

    async def close(self):
        """Cleanup resources"""
        logger.info("Closing scheduler resources...")
        # Add any specific cleanup needed for BrowserHistoryCollector if necessary
        # The crawler is managed and closed (if needed) in main.py's shutdown
        pass
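
The scheduler now expects the shared AsyncWebCrawler to be injected instead of owning its own PageReader. Below is a hedged sketch of how main.py could own that crawler and hand it to HistoryScheduler; the lifespan wiring, task handling, and module-level handle are assumptions for illustration, not the repository's actual main.py.

# Hedged sketch of the wiring implied by `app_main.crawler` and
# `HistoryScheduler(crawler=...)`; not the actual main.py from this commit.
import asyncio
from contextlib import asynccontextmanager

from crawl4ai import AsyncWebCrawler
from fastapi import FastAPI

from .scheduler import HistoryScheduler

crawler = None  # module-level handle read by routers via `app_main.crawler`


@asynccontextmanager
async def lifespan(app: FastAPI):
    global crawler
    async with AsyncWebCrawler() as c:  # crawl4ai manages the browser lifecycle
        crawler = c
        scheduler = HistoryScheduler(crawler=c)
        history_task = asyncio.create_task(scheduler.update_history())
        markdown_task = asyncio.create_task(scheduler.update_missing_markdown_periodically())
        try:
            yield
        finally:
            history_task.cancel()
            markdown_task.cancel()
            await scheduler.close()
            crawler = None


app = FastAPI(lifespan=lifespan)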

45 app/utils.py Normal file
@@ -0,0 +1,45 @@
from datetime import datetime
from .database import HistoryEntry, Bookmark


def serialize_history_entry(entry, include_content: bool = False):
    """Serialize a HistoryEntry object or raw SQL result to a dictionary"""
    # Handle both ORM objects and raw SQL results
    if hasattr(entry, '_mapping'):  # Raw SQL result (from execute)
        result = {
            "id": entry.id,
            "url": entry.url,
            "title": entry.title,
            "visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
            "domain": entry.domain,
            # Add potential highlight fields if they exist
            "title_highlight": getattr(entry, 'title_highlight', None),
            "content_highlight": getattr(entry, 'content_highlight', None),
            "rank": getattr(entry, 'rank', None)
        }
        if include_content:
            # Ensure markdown_content exists before accessing
            result["markdown_content"] = getattr(entry, 'markdown_content', None)

    else:  # ORM object (from query)
        result = {
            "id": entry.id,
            "url": entry.url,
            "title": entry.title,
            "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
            "domain": entry.domain,
        }
        if include_content:
            result["markdown_content"] = entry.markdown_content

    return result


def serialize_bookmark(bookmark):
    """Serialize a Bookmark object to a dictionary"""
    return {
        "id": bookmark.id,
        "url": bookmark.url,
        "title": bookmark.title,
        "added_time": bookmark.added_time.isoformat() if bookmark.added_time else None,
        "folder": bookmark.folder,
        "domain": bookmark.domain,
    }
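
A hedged sketch of how a JSON route could use serialize_history_entry is shown next; the /api/history path and its query parameter are assumptions for illustration, not code from this commit.

# Hedged usage sketch for serialize_history_entry; the route path is assumed.
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session

from ..database import get_db, HistoryEntry
from ..utils import serialize_history_entry

router = APIRouter(tags=["history"])


@router.get("/api/history")
async def recent_history(limit: int = 50, db: Session = Depends(get_db)):
    entries = (
        db.query(HistoryEntry)
        .order_by(HistoryEntry.visit_time.desc())
        .limit(limit)
        .all()
    )
    # include_content=False keeps the response small; pass True to embed the markdown
    return [serialize_history_entry(entry, include_content=False) for entry in entries]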

@@ -4,13 +4,19 @@ excluded_domains:
- localhost
- 127.0.0.1

# Specific Domains / Subdomains
- ap.www.namecheap.com # Ignore this specific subdomain
- www.namecheap.com # Ignore the main domain (will cover /twofa/* path implicitly)
- login.linode.com # Ignore the login subdomain

# IP ranges (requires wildcard matching in config.py)
- 192.168.*.*
- 10.*.*.*
- 172.16.*.*
- 0.0.0.* # Note: Be careful with overly broad patterns

# Example wildcard patterns (requires wildcard matching in config.py)
# - *.local
# - *.githubusercontent.com
# - *.google.com # Example: Ignore all google subdomains
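
The wildcard entries above are matched against the visited host, not the full URL. A minimal sketch of that check with fnmatch follows; Config.is_domain_ignored in app/config.py is assumed to behave roughly like this, and the pattern list here is just a sample.

# Minimal sketch of wildcard domain matching against excluded_domains-style
# patterns; an assumption about the behavior, not the exact app/config.py code.
import fnmatch
from urllib.parse import urlparse

EXCLUDED_DOMAINS = [
    "localhost",
    "127.0.0.1",
    "ap.www.namecheap.com",
    "192.168.*.*",
    "*.local",
]


def is_domain_ignored(domain: str, patterns=EXCLUDED_DOMAINS) -> bool:
    domain = domain.lower()
    return any(fnmatch.fnmatch(domain, pattern.lower()) for pattern in patterns)


print(is_domain_ignored(urlparse("http://192.168.1.10/admin").netloc))    # True
print(is_domain_ignored(urlparse("https://example.com/article").netloc))  # False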

20 pyproject.toml Normal file
@@ -0,0 +1,20 @@
[project]
name = "browser-recall"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10.16"
dependencies = [
    "crawl4ai",
    "fastapi",
    "sqlalchemy",
    "uvicorn",
    "pytz",
    "aiofiles",
    "websockets",
    "pyyaml",
    "browser-history",
    "pydantic",
    "pydantic-settings",
    "iso8601",
]