From 9378f77a61551cbe5089b36bf7d062dcf48e2a50 Mon Sep 17 00:00:00 2001 From: Zetaphor Date: Sat, 25 Jan 2025 23:28:32 -0600 Subject: [PATCH] Update extension to use single websocket and like 100 other things --- .gitignore | 6 +- app/config.py | 38 +++++ app/config.yaml | 13 ++ app/database.py | 147 ++++++++++++++----- app/logging_config.py | 52 +++++++ app/main.py | 305 +++++++++++++++++++++++++++++++--------- app/page_reader.py | 44 ++++-- app/scheduler.py | 99 ++++++------- extension/background.js | 87 +++++++++++- extension/content.js | 126 ++--------------- page-reader.py | 84 ----------- requirements.txt | 7 +- 12 files changed, 634 insertions(+), 374 deletions(-) create mode 100644 app/config.yaml create mode 100644 app/logging_config.py delete mode 100644 page-reader.py diff --git a/.gitignore b/.gitignore index ba0430d..8559227 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,5 @@ -__pycache__/ \ No newline at end of file +__pycache__/ +logs/ +*.db +*.db-shm +*.db-wal \ No newline at end of file diff --git a/app/config.py b/app/config.py index 31ba229..17fe9d7 100644 --- a/app/config.py +++ b/app/config.py @@ -3,6 +3,44 @@ from pathlib import Path from typing import Set import fnmatch +class Config: + def __init__(self): + self.config_path = Path(__file__).parent / "config.yaml" + self.load_config() + + def load_config(self): + if not self.config_path.exists(): + self.config = {"ignored_domains": []} + self.save_config() + else: + with open(self.config_path, 'r') as f: + self.config = yaml.safe_load(f) + + def save_config(self): + with open(self.config_path, 'w') as f: + yaml.dump(self.config, f) + + def is_domain_ignored(self, domain: str) -> bool: + """Check if a domain matches any of the ignored patterns""" + patterns = self.config.get('ignored_domains', []) + return any(fnmatch.fnmatch(domain.lower(), pattern.lower()) for pattern in patterns) + + def add_ignored_domain(self, pattern: str): + """Add a new domain pattern to the ignored list""" + if 'ignored_domains' not in self.config: + self.config['ignored_domains'] = [] + if pattern not in self.config['ignored_domains']: + self.config['ignored_domains'].append(pattern) + self.save_config() + + def remove_ignored_domain(self, pattern: str): + """Remove a domain pattern from the ignored list""" + if 'ignored_domains' in self.config: + self.config['ignored_domains'] = [ + p for p in self.config['ignored_domains'] if p != pattern + ] + self.save_config() + class ReaderConfig: def __init__(self): self.excluded_patterns: Set[str] = set() diff --git a/app/config.yaml b/app/config.yaml new file mode 100644 index 0000000..f252f34 --- /dev/null +++ b/app/config.yaml @@ -0,0 +1,13 @@ +# Domains that should be ignored by the history tracker +# Supports wildcards (*) for pattern matching +ignored_domains: + - "192.168.*" # Ignore local network addresses + - "127.0.0.1" # Ignore localhost IP addresses + - "localhost" # Ignore localhost domains + - "172.*" + - "localhost:*" # Ignore all localhost ports + - "127.0.0.1:*" # Ignore all localhost IP ports + - "*.local" # Ignore .local domains + - "about:*" # Ignore about: URLs + - "chrome-extension://*" # Ignore Chrome extensions + - "chrome://*" # Ignore Chrome URLs diff --git a/app/database.py b/app/database.py index c8edecf..dfbf7b2 100644 --- a/app/database.py +++ b/app/database.py @@ -1,70 +1,143 @@ -from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text +from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, event from 
sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker from datetime import datetime +import sqlite3 SQLALCHEMY_DATABASE_URL = "sqlite:///./browser_history.db" -engine = create_engine(SQLALCHEMY_DATABASE_URL) -SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) +# Create engine with custom configuration +engine = create_engine( + SQLALCHEMY_DATABASE_URL, + connect_args={ + "timeout": 30, # Connection timeout in seconds + "check_same_thread": False, # Allow multi-threaded access + }, + # Enable write-ahead logging and set a larger pool size + pool_size=1, # Single connection pool since we're using one connection + max_overflow=0, # Prevent additional connections + pool_recycle=3600, # Recycle connection every hour +) + +SessionLocal = sessionmaker( + autocommit=False, + autoflush=False, + bind=engine, + expire_on_commit=False # Prevent unnecessary reloads +) Base = declarative_base() +@event.listens_for(engine, "connect") +def set_sqlite_pragma(dbapi_connection, connection_record): + """Configure SQLite for better performance""" + if isinstance(dbapi_connection, sqlite3.Connection): + cursor = dbapi_connection.cursor() + + # Enable WAL mode for better write performance and concurrency + cursor.execute("PRAGMA journal_mode=WAL") + + # Set page size to 4KB for better performance + cursor.execute("PRAGMA page_size=4096") + + # Set cache size to 32MB (-32000 pages * 4KB per page = ~32MB) + cursor.execute("PRAGMA cache_size=-32000") + + # Enable memory-mapped I/O for better performance + cursor.execute("PRAGMA mmap_size=268435456") # 256MB + + # Set synchronous mode to NORMAL for better write performance + cursor.execute("PRAGMA synchronous=NORMAL") + + # Enable foreign key support + cursor.execute("PRAGMA foreign_keys=ON") + + cursor.close() + class HistoryEntry(Base): __tablename__ = "history" id = Column(Integer, primary_key=True) - url = Column(String) + url = Column(String, index=True) # Add index for URL lookups title = Column(String) - visit_time = Column(DateTime) - domain = Column(String) + visit_time = Column(DateTime, index=True) # Add index for time-based queries + domain = Column(String, index=True) # Add index for domain filtering markdown_content = Column(Text, nullable=True) last_content_update = Column(DateTime, nullable=True) + __table_args__ = ( + # Composite index for common query patterns + {'sqlite_with_rowid': True} # Ensure we have rowids for better performance + ) + class Bookmark(Base): __tablename__ = "bookmarks" - id = Column(Integer, primary_key=True, index=True) + id = Column(Integer, primary_key=True) url = Column(String, index=True) title = Column(String, nullable=True) added_time = Column(DateTime, index=True) folder = Column(String, index=True) domain = Column(String, index=True) -class BlacklistedDomain(Base): - __tablename__ = "blacklisted_domains" - - id = Column(Integer, primary_key=True) - domain = Column(String, unique=True, index=True) - reason = Column(String, nullable=True) - added_time = Column(DateTime, default=datetime.utcnow) - - @classmethod - def is_blacklisted(cls, db: SessionLocal, domain: str) -> bool: - """Check if a domain is blacklisted""" - return db.query(cls).filter(cls.domain == domain.lower()).first() is not None - - @classmethod - def add_to_blacklist(cls, db: SessionLocal, domain: str, reason: str = None): - """Add a domain to the blacklist""" - try: - blacklist_entry = cls( - domain=domain.lower(), - reason=reason - ) - db.add(blacklist_entry) - db.commit() - except: - 
db.rollback() - # If entry already exists, just update the reason - existing = db.query(cls).filter(cls.domain == domain.lower()).first() - if existing and reason: - existing.reason = reason - db.commit() + __table_args__ = ( + # Composite index for common query patterns + {'sqlite_with_rowid': True} # Ensure we have rowids for better performance + ) +# Create tables Base.metadata.create_all(bind=engine) +# Initialize FTS tables for full-text search +def init_fts(): + """Initialize Full Text Search tables""" + conn = engine.raw_connection() + cursor = conn.cursor() + + # Create FTS table for history content + cursor.execute(""" + CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5( + title, + markdown_content, + content='history', + content_rowid='id', + tokenize='porter unicode61' + ) + """) + + # Create triggers to keep FTS index up to date + cursor.execute(""" + CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN + INSERT INTO history_fts(rowid, title, markdown_content) + VALUES (new.id, new.title, new.markdown_content); + END; + """) + + cursor.execute(""" + CREATE TRIGGER IF NOT EXISTS history_ad AFTER DELETE ON history BEGIN + INSERT INTO history_fts(history_fts, rowid, title, markdown_content) + VALUES('delete', old.id, old.title, old.markdown_content); + END; + """) + + cursor.execute(""" + CREATE TRIGGER IF NOT EXISTS history_au AFTER UPDATE ON history BEGIN + INSERT INTO history_fts(history_fts, rowid, title, markdown_content) + VALUES('delete', old.id, old.title, old.markdown_content); + INSERT INTO history_fts(rowid, title, markdown_content) + VALUES (new.id, new.title, new.markdown_content); + END; + """) + + conn.commit() + cursor.close() + conn.close() + +# Initialize FTS tables +init_fts() + def get_db(): + """Get database session""" db = SessionLocal() try: yield db diff --git a/app/logging_config.py b/app/logging_config.py new file mode 100644 index 0000000..649de31 --- /dev/null +++ b/app/logging_config.py @@ -0,0 +1,52 @@ +import logging +import logging.handlers +import os +from datetime import datetime +from pathlib import Path + +# Create logs directory if it doesn't exist +LOGS_DIR = Path("logs") +LOGS_DIR.mkdir(exist_ok=True) + +# Create formatters +CONSOLE_FORMAT = '%(levelname)s: %(message)s' +FILE_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' + +def setup_logger(name: str) -> logging.Logger: + """ + Set up a logger with both file and console handlers + + Args: + name: The name of the logger (usually __name__) + + Returns: + logging.Logger: Configured logger instance + """ + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) + + # Prevent adding handlers multiple times + if logger.handlers: + return logger + + # Console handler + console_handler = logging.StreamHandler() + console_handler.setLevel(logging.WARNING) + console_handler.setFormatter(logging.Formatter(CONSOLE_FORMAT)) + + # File handler + log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m')}.log" + file_handler = logging.handlers.RotatingFileHandler( + log_file, + maxBytes=10*1024*1024, # 10MB + backupCount=5, + encoding='utf-8' + ) + file_handler.setLevel(logging.INFO) + file_handler.setFormatter(logging.Formatter(FILE_FORMAT)) + + # Add handlers + logger.addHandler(console_handler) + logger.addHandler(file_handler) + + return logger \ No newline at end of file diff --git a/app/main.py b/app/main.py index dcf2866..21af921 100644 --- a/app/main.py +++ b/app/main.py @@ -1,6 +1,6 @@ -from fastapi import FastAPI, Depends, Query, WebSocket, 
WebSocketDisconnect +from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException from sqlalchemy.orm import Session -from datetime import datetime, timezone +from datetime import datetime, timezone, timedelta from typing import List, Optional import asyncio from fastapi import WebSocketDisconnect @@ -8,14 +8,22 @@ from urllib.parse import urlparse import pytz from fastapi.middleware.cors import CORSMiddleware import iso8601 +from bs4 import BeautifulSoup +from sqlalchemy import text +from sqlalchemy.sql import text +from .logging_config import setup_logger from .database import get_db, HistoryEntry, Bookmark from .scheduler import HistoryScheduler from .page_info import PageInfo from .page_reader import PageReader +from .config import Config + +logger = setup_logger(__name__) app = FastAPI() scheduler = HistoryScheduler() +config = Config() # Add CORS middleware to allow WebSocket connections app.add_middleware( @@ -28,6 +36,7 @@ app.add_middleware( @app.on_event("startup") async def startup_event(): + logger.info("Starting application") # Initial bookmark fetch await scheduler.update_bookmarks() # Start the background task @@ -35,13 +44,24 @@ async def startup_event(): def serialize_history_entry(entry, include_content: bool = False): """Serialize a HistoryEntry object to a dictionary""" - result = { - "id": entry.id, - "url": entry.url, - "title": entry.title, - "visit_time": entry.visit_time.isoformat() if entry.visit_time else None, - "domain": entry.domain, - } + # Handle both ORM objects and raw SQL results + if hasattr(entry, '_mapping'): # Raw SQL result + result = { + "id": entry.id, + "url": entry.url, + "title": entry.title, + "visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time, + "domain": entry.domain, + } + else: # ORM object + result = { + "id": entry.id, + "url": entry.url, + "title": entry.title, + "visit_time": entry.visit_time.isoformat() if entry.visit_time else None, + "domain": entry.domain, + } + if include_content: result["markdown_content"] = entry.markdown_content return result @@ -66,25 +86,54 @@ async def search_history( include_content: bool = Query(False), db: Session = Depends(get_db) ): - query = db.query(HistoryEntry) + """Search history with optimized full-text search""" + try: + # If there's a full-text search term, use the FTS table + if search_term: + # Use raw SQL for FTS query to leverage SQLite's optimization + fts_query = """ + SELECT h.* FROM history h + INNER JOIN history_fts f ON h.id = f.rowid + WHERE history_fts MATCH :search + AND (:domain IS NULL OR h.domain = :domain) + AND (:start_date IS NULL OR h.visit_time >= :start_date) + AND (:end_date IS NULL OR h.visit_time <= :end_date) + ORDER BY rank + LIMIT 1000 + """ + results = db.execute( + text(fts_query), + { + 'search': search_term, + 'domain': domain, + 'start_date': start_date, + 'end_date': end_date + } + ).all() - if domain: - query = query.filter(HistoryEntry.domain == domain) + # Return serialized results directly + return [serialize_history_entry(row, include_content) for row in results] + else: + # Start with base query + query = db.query(HistoryEntry) - if start_date: - query = query.filter(HistoryEntry.visit_time >= start_date) + # Apply filters + if domain: + query = query.filter(HistoryEntry.domain == domain) - if end_date: - query = query.filter(HistoryEntry.visit_time <= end_date) + if start_date: + query = query.filter(HistoryEntry.visit_time >= start_date) - if search_term: - query = 
query.filter( - (HistoryEntry.title.ilike(f"%{search_term}%")) | - (HistoryEntry.markdown_content.ilike(f"%{search_term}%")) - ) + if end_date: + query = query.filter(HistoryEntry.visit_time <= end_date) - entries = query.all() - return [serialize_history_entry(entry, include_content) for entry in entries] + # Execute query with limit for better performance + entries = query.limit(1000).all() + return [serialize_history_entry(entry, include_content) for entry in entries] + + except Exception as e: + print(f"Search error: {e}") + raise HTTPException(status_code=500, detail="Search operation failed") @app.get("/bookmarks/search") async def search_bookmarks( @@ -93,84 +142,204 @@ async def search_bookmarks( search_term: Optional[str] = Query(None), db: Session = Depends(get_db) ): - query = db.query(Bookmark) + """Search bookmarks with optimized queries""" + try: + # Build query efficiently + query = db.query(Bookmark) - if domain: - query = query.filter(Bookmark.domain == domain) + # Apply filters using index-optimized queries + if domain: + query = query.filter(Bookmark.domain == domain) - if folder: - query = query.filter(Bookmark.folder == folder) + if folder: + query = query.filter(Bookmark.folder == folder) - if search_term: - query = query.filter(Bookmark.title.ilike(f"%{search_term}%")) + if search_term: + # Use LIKE with index hint for title search + search_pattern = f"%{search_term}%" + query = query.filter( + Bookmark.title.ilike(search_pattern) + ).with_hint( + Bookmark, + 'INDEXED BY ix_bookmarks_title', + 'sqlite' + ) - bookmarks = query.all() - return [serialize_bookmark(bookmark) for bookmark in bookmarks] + # Add ordering and limit for better performance + bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all() + + return [serialize_bookmark(bookmark) for bookmark in bookmarks] + + except Exception as e: + print(f"Bookmark search error: {e}") + raise HTTPException(status_code=500, detail="Search operation failed") + +# Add new endpoint for advanced full-text search +@app.get("/history/search/advanced") +async def advanced_history_search( + query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"), + include_content: bool = Query(False), + db: Session = Depends(get_db) +): + """Advanced full-text search using SQLite FTS5 features""" + try: + # Use raw SQL for advanced FTS query + fts_query = """ + SELECT h.*, rank + FROM history h + INNER JOIN history_fts f ON h.id = f.rowid + WHERE history_fts MATCH :query + ORDER BY rank + LIMIT 1000 + """ + + results = db.execute(text(fts_query), {'query': query}).all() + + # Convert results to HistoryEntry objects + entries = [ + serialize_history_entry( + HistoryEntry( + id=row.id, + url=row.url, + title=row.title, + visit_time=row.visit_time, + domain=row.domain, + markdown_content=row.markdown_content if include_content else None + ), + include_content + ) + for row in results + ] + + return entries + + except Exception as e: + print(f"Advanced search error: {e}") + raise HTTPException(status_code=500, detail="Advanced search operation failed") @app.websocket("/ws") async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)): - print("WebSocket endpoint called") + logger.info("New WebSocket connection established") page_reader = PageReader() - print("New WebSocket connection established") await websocket.accept() - print("WebSocket connection accepted") try: while True: - print("Waiting for message...") data = await websocket.receive_json() - print(f"Received message for 
URL: {data['url']}") - print(f"HTML content length: {len(data['html'])}") - print(f"Timestamp: {data['timestamp']}") - # Parse the ISO timestamp correctly + # Parse the URL and check if domain should be ignored + domain = urlparse(data['url']).netloc + if config.is_domain_ignored(domain): + logger.info(f"Ignoring domain: {domain}") + await websocket.send_json({ + "status": "ignored", + "message": f"Domain {domain} is in ignore list" + }) + continue + + logger.info(f"Processing page: {data['url']}") timestamp = iso8601.parse_date(data['timestamp']) + # Check if we already have a recent entry for this URL + existing_entry = db.query(HistoryEntry).filter( + HistoryEntry.url == data['url'], + HistoryEntry.visit_time >= timestamp - timedelta(minutes=5) + ).first() + + if existing_entry: + print(f"Recent entry exists for URL: {data['url']}") + await websocket.send_json({ + "status": "skipped", + "message": "Recent entry exists" + }) + continue + page_info = PageInfo( url=data['url'], html=data['html'], timestamp=timestamp ) - print(f"Created PageInfo object for: {page_info.url}") - # Convert HTML to markdown - print("Converting HTML to markdown...") + # Debug HTML content + print(f"HTML content length before processing: {len(page_info.html)}") + + # Extract title + soup = BeautifulSoup(page_info.html, 'html.parser') + title = soup.title.string if soup.title else '' + print(f"Extracted title: {title}") + + # Debug markdown conversion + print("Starting markdown conversion...") + cleaned_html = page_reader.clean_html(page_info.html) + print(f"Cleaned HTML length: {len(cleaned_html)}") + markdown_content = page_reader.html_to_markdown(page_info.html) - print(f"Markdown conversion complete, length: {len(markdown_content) if markdown_content else 0}") + print(f"Markdown conversion complete. 
Content length: {len(markdown_content) if markdown_content else 0}") - # Update or create history entry - domain = urlparse(page_info.url).netloc - print(f"Creating history entry for domain: {domain}") + if markdown_content: + print("First 100 chars of markdown:", markdown_content[:100]) + else: + print("No markdown content generated") + + if not title and not markdown_content: + print(f"No content extracted from: {page_info.url}") + await websocket.send_json({ + "status": "skipped", + "message": "No content extracted" + }) + continue + + # Create history entry history_entry = HistoryEntry( url=page_info.url, + title=title, visit_time=page_info.timestamp, domain=domain, markdown_content=markdown_content, last_content_update=datetime.now(timezone.utc) ) - print("Saving to database...") - db.add(history_entry) - db.commit() - print("Database save complete") + # Debug database operation + print(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}") - # Send confirmation back to client - await websocket.send_json({ - "status": "success", - "message": f"Processed page: {page_info.url}" - }) + # Use bulk operations for better performance + db.add(history_entry) + + try: + db.commit() + print(f"Successfully saved entry for: {page_info.url}") + print(f"Verify markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}") + await websocket.send_json({ + "status": "success", + "message": f"Processed page: {page_info.url}" + }) + except Exception as e: + db.rollback() + print(f"Error saving entry: {e}") + await websocket.send_json({ + "status": "error", + "message": "Database error" + }) except WebSocketDisconnect: - print("Client disconnected") + logger.info("Client disconnected") except Exception as e: - print(f"Error handling message: {e}") - # Send error back to client if possible - try: - await websocket.send_json({ - "status": "error", - "message": str(e) - }) - except: - pass + logger.error("Error in WebSocket handler", exc_info=True) finally: - print("Cleaning up resources") - page_reader.close() \ No newline at end of file + await page_reader.close() + +@app.get("/config/ignored-domains") +async def get_ignored_domains(): + """Get list of ignored domain patterns""" + return {"ignored_domains": config.config.get('ignored_domains', [])} + +@app.post("/config/ignored-domains") +async def add_ignored_domain(pattern: str): + """Add a new domain pattern to ignored list""" + config.add_ignored_domain(pattern) + return {"status": "success", "message": f"Added pattern: {pattern}"} + +@app.delete("/config/ignored-domains/{pattern}") +async def remove_ignored_domain(pattern: str): + """Remove a domain pattern from ignored list""" + config.remove_ignored_domain(pattern) + return {"status": "success", "message": f"Removed pattern: {pattern}"} \ No newline at end of file diff --git a/app/page_reader.py b/app/page_reader.py index e2d8175..0edcf27 100644 --- a/app/page_reader.py +++ b/app/page_reader.py @@ -4,15 +4,11 @@ from bs4 import BeautifulSoup from typing import Optional from urllib.parse import urlparse from .config import ReaderConfig -import logging -from .database import SessionLocal, BlacklistedDomain +from .logging_config import setup_logger +from .database import SessionLocal -# Setup logging with less verbose output -logging.basicConfig( - level=logging.WARNING, - format='%(levelname)s: %(message)s' -) -logger = logging.getLogger(__name__) +# Setup logger for this module +logger = setup_logger(__name__) 
# Patterns for cleaning SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>" @@ -26,13 +22,15 @@ SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)" class PageReader: def __init__(self): self.config = ReaderConfig() - self.db = SessionLocal() + logger.info("PageReader initialized") def clean_html(self, html: str) -> str: """Clean HTML by removing unwanted elements and patterns.""" if not html: + logger.warning("Received empty HTML to clean") return "" + logger.debug(f"Cleaning HTML of length: {len(html)}") # First use regex to remove problematic patterns html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) @@ -54,12 +52,15 @@ class PageReader: ] for element in elements_to_remove: + removed = len(soup.find_all(element)) + if removed: + logger.debug(f"Removed {removed} {element} elements") for tag in soup.find_all(element): tag.decompose() return str(soup) except Exception as e: - logger.error(f"Error cleaning HTML: {e}") + logger.error(f"Error cleaning HTML: {e}", exc_info=True) return "" def clean_whitespace(self, text: str) -> str: @@ -80,11 +81,17 @@ class PageReader: def html_to_markdown(self, html: str) -> Optional[str]: """Convert HTML to markdown.""" try: + logger.info("Starting HTML to Markdown conversion") + logger.debug(f"Input HTML length: {len(html)}") + cleaned_html = self.clean_html(html) + logger.debug(f"Cleaned HTML length: {len(cleaned_html)}") + if not cleaned_html: + logger.warning("No cleaned HTML content") return None - return self.clean_whitespace(md(cleaned_html, + markdown = self.clean_whitespace(md(cleaned_html, heading_style="ATX", bullets="-", autolinks=True, @@ -92,10 +99,19 @@ class PageReader: escape_asterisks=True, escape_underscores=True)) + logger.debug(f"Generated markdown length: {len(markdown) if markdown else 0}") + + if not markdown or markdown.isspace(): + logger.warning("Markdown is empty or whitespace only") + return None + + return markdown + except Exception as e: - logger.error(f"Error converting to markdown: {e}") + logger.error("Error converting to markdown", exc_info=True) return None - def close(self): + async def close(self): """Cleanup resources""" - self.db.close() \ No newline at end of file + logger.info("Closing PageReader") + pass # No need to close DB connection anymore \ No newline at end of file diff --git a/app/scheduler.py b/app/scheduler.py index d80d79c..a730888 100644 --- a/app/scheduler.py +++ b/app/scheduler.py @@ -7,6 +7,9 @@ from .page_reader import PageReader from sqlalchemy import func from sqlalchemy.orm import Session import pytz +from .config import Config +from .database import get_db +from urllib.parse import urlparse class HistoryScheduler: def __init__(self): @@ -14,6 +17,7 @@ class HistoryScheduler: self.page_reader = PageReader() self.last_history_update = None self.content_update_interval = timedelta(hours=24) # Update content daily + self.config = Config() def _normalize_datetime(self, dt: datetime) -> datetime: """Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't""" @@ -28,81 +32,70 @@ class HistoryScheduler: return dt.astimezone(pytz.UTC) async def update_bookmarks(self): - bookmarks = self.browser_collector.fetch_bookmarks() - - db = SessionLocal() + """Update bookmarks from browser""" try: - # First, get all existing URLs to avoid duplicates - existing_urls = { - url: (added_time, folder) - for url, added_time, folder in - db.query(Bookmark.url, Bookmark.added_time, 
Bookmark.folder).all() - } + db = next(get_db()) + bookmarks = self.browser_collector.fetch_bookmarks() + + for added_time, url, title, folder in bookmarks: # Unpack the tuple + # Extract domain and check if it should be ignored + domain = urlparse(url).netloc + if self.config.is_domain_ignored(domain): + continue - new_entries = [] - for added_time, url, title, folder in bookmarks: # Normalize the datetime added_time = self._normalize_datetime(added_time) - # Only add if URL doesn't exist or if it's in a different folder - if (url not in existing_urls or - existing_urls[url][1] != folder): - domain = self.browser_collector.get_domain(url) - entry = Bookmark( - url=url, - title=title, - added_time=added_time, - folder=folder, - domain=domain - ) - new_entries.append(entry) + # Process the bookmark only if domain is not ignored + bookmark_entry = Bookmark( + url=url, + title=title, + added_time=added_time, + folder=folder, + domain=domain + ) + db.add(bookmark_entry) - if new_entries: - db.bulk_save_objects(new_entries) - db.commit() + db.commit() + + except Exception as e: + print(f"Error updating bookmarks: {e}") finally: db.close() async def update_history(self): + """Background task to update history periodically""" while True: - db = SessionLocal() try: - # Get the latest timestamp from our database - latest_entry = db.query(func.max(HistoryEntry.visit_time)).scalar() - if latest_entry: - latest_entry = self._normalize_datetime(latest_entry) + db = next(get_db()) + history_entries = self.browser_collector.fetch_history() - # Fetch new history - history = self.browser_collector.fetch_history() + for visit_time, url, title in history_entries: # Unpack the tuple + # Extract domain and check if it should be ignored + domain = urlparse(url).netloc + if self.config.is_domain_ignored(domain): + continue - # Filter to only get entries newer than our latest entry - new_entries = [] - for visit_time, url, title in history: # Normalize the datetime visit_time = self._normalize_datetime(visit_time) - if not latest_entry or visit_time > latest_entry: - domain = self.browser_collector.get_domain(url) - entry = HistoryEntry( - url=url, - title=title, - visit_time=visit_time, - domain=domain - ) - new_entries.append(entry) + # Process the entry only if domain is not ignored + history_entry = HistoryEntry( + url=url, + title=title, + visit_time=visit_time, + domain=domain + ) + db.add(history_entry) - if new_entries: - db.bulk_save_objects(new_entries) - db.commit() - - # Update bookmarks - await self.update_bookmarks() + db.commit() + except Exception as e: + print(f"Error updating history: {e}") finally: db.close() - # Wait for 5 minutes before next update - await asyncio.sleep(300) + await asyncio.sleep(300) # Wait 5 minutes before next update async def close(self): """Cleanup resources""" diff --git a/extension/background.js b/extension/background.js index c9cdb23..030187c 100644 --- a/extension/background.js +++ b/extension/background.js @@ -1,5 +1,82 @@ console.log("Background script loaded"); +class WebSocketClient { + constructor() { + console.log("WebSocketClient constructor called"); + this.messageQueue = []; + this.connect(); + this.reconnectAttempts = 0; + this.maxReconnectAttempts = 5; + } + + connect() { + console.log('Attempting to connect to WebSocket server...'); + try { + this.ws = new WebSocket('ws://localhost:8523/ws'); + console.log('WebSocket instance created'); + + this.ws.addEventListener('open', () => { + console.log('WebSocket connection opened successfully'); + 
this.reconnectAttempts = 0; + this.processQueue(); + }); + + this.ws.addEventListener('error', (event) => { + console.error('WebSocket error occurred:', event); + }); + + this.ws.addEventListener('close', (event) => { + console.log('WebSocket connection closed:', event.code, event.reason); + this.tryReconnect(); + }); + + this.ws.addEventListener('message', (event) => { + console.log('Received message from server:', event.data); + }); + } catch (error) { + console.error('Error creating WebSocket:', error); + } + } + + processQueue() { + console.log(`Processing message queue (${this.messageQueue.length} messages)`); + while (this.messageQueue.length > 0) { + const data = this.messageQueue.shift(); + this.sendMessage(data); + } + } + + tryReconnect() { + if (this.reconnectAttempts < this.maxReconnectAttempts) { + this.reconnectAttempts++; + console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`); + setTimeout(() => this.connect(), 2000 * this.reconnectAttempts); + } else { + console.log('Max reconnection attempts reached'); + } + } + + sendMessage(data) { + if (this.ws.readyState === WebSocket.OPEN) { + try { + console.log('Sending data for URL:', data.url); + this.ws.send(JSON.stringify(data)); + console.log('Data sent successfully'); + return true; + } catch (error) { + console.error('Error sending data:', error); + return false; + } + } else { + console.log('WebSocket not ready, queueing message'); + this.messageQueue.push(data); + return true; + } + } +} + +const wsClient = new WebSocketClient(); + async function isContentScriptReady(tabId) { try { await browser.tabs.sendMessage(tabId, { type: "PING" }); @@ -38,9 +115,17 @@ async function sendMessageToTab(tabId) { } } +// Listen for messages from content scripts +browser.runtime.onMessage.addListener((message, sender) => { + if (message.type === "SEND_PAGE_CONTENT") { + console.log('Received page content from tab:', sender.tab.id); + wsClient.sendMessage(message.data); + } +}); + browser.webNavigation.onCompleted.addListener(async (details) => { console.log("Navigation completed", details); - if (details.frameId === 0) { // Only handle main frame navigation + if (details.frameId === 0) { console.log(`Main frame navigation detected for tab ${details.tabId}`); await sendMessageToTab(details.tabId); } diff --git a/extension/content.js b/extension/content.js index f669d75..cb8da49 100644 --- a/extension/content.js +++ b/extension/content.js @@ -1,132 +1,32 @@ console.log("Content script starting initialization..."); -// Function to log WebSocket state -function getWebSocketState(ws) { - const states = { - 0: 'CONNECTING', - 1: 'OPEN', - 2: 'CLOSING', - 3: 'CLOSED' +function sendPageContent() { + const pageContent = { + url: window.location.href, + html: document.documentElement.outerHTML, + timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z') }; - return states[ws.readyState] || 'UNKNOWN'; + + browser.runtime.sendMessage({ + type: "SEND_PAGE_CONTENT", + data: pageContent + }); } -class WebSocketClient { - constructor() { - console.log("WebSocketClient constructor called"); - this.messageQueue = []; - this.connect(); - this.reconnectAttempts = 0; - this.maxReconnectAttempts = 5; - } - - connect() { - console.log('Attempting to connect to WebSocket server...'); - try { - this.ws = new WebSocket('ws://localhost:8523/ws'); - console.log('WebSocket instance created'); - - this.ws.addEventListener('open', () => { - console.log('WebSocket connection opened successfully'); - this.reconnectAttempts 
= 0; - // Process any queued messages - this.processQueue(); - }); - - this.ws.addEventListener('error', (event) => { - console.error('WebSocket error occurred:', event); - }); - - this.ws.addEventListener('close', (event) => { - console.log('WebSocket connection closed:', event.code, event.reason); - this.tryReconnect(); - }); - - this.ws.addEventListener('message', (event) => { - console.log('Received message from server:', event.data); - }); - } catch (error) { - console.error('Error creating WebSocket:', error); - } - } - - processQueue() { - console.log(`Processing message queue (${this.messageQueue.length} messages)`); - while (this.messageQueue.length > 0) { - const data = this.messageQueue.shift(); - this.sendMessage(data); - } - } - - tryReconnect() { - if (this.reconnectAttempts < this.maxReconnectAttempts) { - this.reconnectAttempts++; - console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`); - setTimeout(() => this.connect(), 2000 * this.reconnectAttempts); - } else { - console.log('Max reconnection attempts reached'); - } - } - - sendMessage(data) { - console.log('sendMessage called, WebSocket state:', getWebSocketState(this.ws)); - if (this.ws.readyState === WebSocket.OPEN) { - try { - console.log('Preparing to send data:', { - url: data.url, - timestamp: data.timestamp, - htmlLength: data.html.length - }); - this.ws.send(JSON.stringify(data)); - console.log('Data sent successfully'); - return true; - } catch (error) { - console.error('Error sending data:', error); - return false; - } - } else { - console.log('WebSocket not ready, queueing message'); - this.messageQueue.push(data); - return true; - } - } -} - -console.log("Creating WebSocketClient instance..."); -const wsClient = new WebSocketClient(); - -console.log("Setting up message listener..."); +// Listen for messages from the background script browser.runtime.onMessage.addListener((message, sender, sendResponse) => { - console.log('Message received from background script:', message); - if (message.type === "PING") { - console.log('Received PING, responding...'); return Promise.resolve({ status: "ready" }); } if (message.type === "GET_PAGE_CONTENT") { - console.log('Processing GET_PAGE_CONTENT message'); - const pageContent = { - url: window.location.href, - html: document.documentElement.outerHTML, - timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z') - }; - - console.log('Created page content object for:', pageContent.url); - wsClient.sendMessage(pageContent); + sendPageContent(); } return true; }); // Send initial page content -console.log('Sending initial page content...'); -const pageContent = { - url: window.location.href, - html: document.documentElement.outerHTML, - timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z') -}; - -wsClient.sendMessage(pageContent); +sendPageContent(); console.log("Content script initialization complete for:", window.location.href); \ No newline at end of file diff --git a/page-reader.py b/page-reader.py deleted file mode 100644 index ae83191..0000000 --- a/page-reader.py +++ /dev/null @@ -1,84 +0,0 @@ -import httpx -import re -from markdownify import markdownify as md -from bs4 import BeautifulSoup - -# Patterns for cleaning -SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>" -STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>" -META_PATTERN = r"<[ ]*meta.*?>" -COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>" -LINK_PATTERN = r"<[ ]*link.*?>" -BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>' -SVG_PATTERN = 
r"(<svg[^>]*>)(.*?)(<\/svg>)" - -def clean_html(html: str) -> str: - """Clean HTML by removing unwanted elements and patterns.""" - # First use regex to remove problematic patterns - html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) - html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) - html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) - html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) - html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) - html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) - html = re.sub(BASE64_IMG_PATTERN, "", html) - - # Use BeautifulSoup to remove additional elements we want to strip - soup = BeautifulSoup(html, 'html.parser') - - # Remove unwanted elements - elements_to_remove = [ - 'canvas', 'img', 'picture', 'audio', 'video', - 'iframe', 'embed', 'object', 'param', 'track', - 'map', 'area', 'source' - ] - - for element in elements_to_remove: - for tag in soup.find_all(element): - tag.decompose() - - return str(soup) - -def get_page_html(url: str) -> str: - """Fetch HTML content from a given URL using httpx.""" - headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" - } - try: - with httpx.Client(follow_redirects=True) as client: - response = client.get(url, headers=headers) - response.raise_for_status() - return response.text - except httpx.HTTPError as e: - print(f"Error fetching page: {e}") - return "" - -def clean_whitespace(text: str) -> str: - """Clean excessive whitespace from text, collapsing more than 2 newlines.""" - # Replace 3 or more newlines with 2 newlines - cleaned = re.sub(r'\n{3,}', '\n\n', text) - # Remove trailing whitespace from each line - cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines()) - return cleaned.strip() - -def html_to_markdown(url: str) -> str: - """Convert webpage HTML to markdown.""" - html = get_page_html(url) - if not html: - return "" - - # Clean the HTML first - cleaned_html = clean_html(html) - - # Convert to markdown using markdownify - # Configure markdownify options for clean output - markdown = md(cleaned_html, - heading_style="ATX", # Use # style headers - bullets="-", # Use - for bullets - autolinks=True, # Convert URLs to links - strip=['form'], # Additional elements to strip - escape_asterisks=True, - escape_underscores=True) - - # Clean up excessive whitespace - return clean_whitespace(markdown) diff --git a/requirements.txt b/requirements.txt index eb88162..58ed474 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,9 +2,10 @@ fastapi uvicorn sqlalchemy browser-history -beautifulsoup4 +beautifulsoup4>=4.9.3 markdownify -pyyaml +pyyaml>=6.0.1 pytz websockets==11.0.3 -iso8601==2.1.0 \ No newline at end of file +iso8601==2.1.0 +lxml>=4.9.3 \ No newline at end of file