Update extension to use a single WebSocket connection, plus many other changes

commit 9378f77a61 (parent 7388ac18d4)
Date: 2025-01-25 23:28:32 -06:00

12 changed files with 634 additions and 374 deletions

.gitignore

@@ -1 +1,5 @@
 __pycache__/
+logs/
+*.db
+*.db-shm
+*.db-wal

app/config.py

@@ -3,6 +3,44 @@ from pathlib import Path
 from typing import Set
 import fnmatch
 
+class Config:
+    def __init__(self):
+        self.config_path = Path(__file__).parent / "config.yaml"
+        self.load_config()
+
+    def load_config(self):
+        if not self.config_path.exists():
+            self.config = {"ignored_domains": []}
+            self.save_config()
+        else:
+            with open(self.config_path, 'r') as f:
+                self.config = yaml.safe_load(f)
+
+    def save_config(self):
+        with open(self.config_path, 'w') as f:
+            yaml.dump(self.config, f)
+
+    def is_domain_ignored(self, domain: str) -> bool:
+        """Check if a domain matches any of the ignored patterns"""
+        patterns = self.config.get('ignored_domains', [])
+        return any(fnmatch.fnmatch(domain.lower(), pattern.lower()) for pattern in patterns)
+
+    def add_ignored_domain(self, pattern: str):
+        """Add a new domain pattern to the ignored list"""
+        if 'ignored_domains' not in self.config:
+            self.config['ignored_domains'] = []
+        if pattern not in self.config['ignored_domains']:
+            self.config['ignored_domains'].append(pattern)
+            self.save_config()
+
+    def remove_ignored_domain(self, pattern: str):
+        """Remove a domain pattern from the ignored list"""
+        if 'ignored_domains' in self.config:
+            self.config['ignored_domains'] = [
+                p for p in self.config['ignored_domains'] if p != pattern
+            ]
+            self.save_config()
+
 class ReaderConfig:
     def __init__(self):
         self.excluded_patterns: Set[str] = set()

app/config.yaml (new file)

@@ -0,0 +1,13 @@
+# Domains that should be ignored by the history tracker
+# Supports wildcards (*) for pattern matching
+ignored_domains:
+  - "192.168.*"             # Ignore local network addresses
+  - "127.0.0.1"             # Ignore localhost IP addresses
+  - "localhost"             # Ignore localhost domains
+  - "172.*"
+  - "localhost:*"           # Ignore all localhost ports
+  - "127.0.0.1:*"           # Ignore all localhost IP ports
+  - "*.local"               # Ignore .local domains
+  - "about:*"               # Ignore about: URLs
+  - "chrome-extension://*"  # Ignore Chrome extensions
+  - "chrome://*"            # Ignore Chrome URLs
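
Note: a small sketch (not part of this commit) of how Config.is_domain_ignored applies these patterns via fnmatch; the hostnames below are arbitrary examples:

    import fnmatch

    patterns = ["192.168.*", "localhost:*", "*.local", "chrome://*"]

    def is_ignored(netloc: str) -> bool:
        # Case-insensitive wildcard match, mirroring Config.is_domain_ignored
        return any(fnmatch.fnmatch(netloc.lower(), p.lower()) for p in patterns)

    print(is_ignored("192.168.1.10"))    # True  - matches "192.168.*"
    print(is_ignored("localhost:8523"))  # True  - matches "localhost:*"
    print(is_ignored("printer.local"))   # True  - matches "*.local"
    print(is_ignored("example.com"))     # False - no pattern matches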

app/database.py

@@ -1,70 +1,143 @@
-from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
+from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, event
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from datetime import datetime
+import sqlite3
 
 SQLALCHEMY_DATABASE_URL = "sqlite:///./browser_history.db"
 
-engine = create_engine(SQLALCHEMY_DATABASE_URL)
-SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+# Create engine with custom configuration
+engine = create_engine(
+    SQLALCHEMY_DATABASE_URL,
+    connect_args={
+        "timeout": 30,  # Connection timeout in seconds
+        "check_same_thread": False,  # Allow multi-threaded access
+    },
+    # Enable write-ahead logging and set a larger pool size
+    pool_size=1,  # Single connection pool since we're using one connection
+    max_overflow=0,  # Prevent additional connections
+    pool_recycle=3600,  # Recycle connection every hour
+)
+
+SessionLocal = sessionmaker(
+    autocommit=False,
+    autoflush=False,
+    bind=engine,
+    expire_on_commit=False  # Prevent unnecessary reloads
+)
+
 Base = declarative_base()
 
+@event.listens_for(engine, "connect")
+def set_sqlite_pragma(dbapi_connection, connection_record):
+    """Configure SQLite for better performance"""
+    if isinstance(dbapi_connection, sqlite3.Connection):
+        cursor = dbapi_connection.cursor()
+        # Enable WAL mode for better write performance and concurrency
+        cursor.execute("PRAGMA journal_mode=WAL")
+        # Set page size to 4KB for better performance
+        cursor.execute("PRAGMA page_size=4096")
+        # Set cache size to 32MB (-32000 pages * 4KB per page = ~32MB)
+        cursor.execute("PRAGMA cache_size=-32000")
+        # Enable memory-mapped I/O for better performance
+        cursor.execute("PRAGMA mmap_size=268435456")  # 256MB
+        # Set synchronous mode to NORMAL for better write performance
+        cursor.execute("PRAGMA synchronous=NORMAL")
+        # Enable foreign key support
+        cursor.execute("PRAGMA foreign_keys=ON")
+        cursor.close()
+
 class HistoryEntry(Base):
     __tablename__ = "history"
     id = Column(Integer, primary_key=True)
-    url = Column(String)
+    url = Column(String, index=True)  # Add index for URL lookups
     title = Column(String)
-    visit_time = Column(DateTime)
-    domain = Column(String)
+    visit_time = Column(DateTime, index=True)  # Add index for time-based queries
+    domain = Column(String, index=True)  # Add index for domain filtering
     markdown_content = Column(Text, nullable=True)
     last_content_update = Column(DateTime, nullable=True)
+    __table_args__ = (
+        # Composite index for common query patterns
+        {'sqlite_with_rowid': True}  # Ensure we have rowids for better performance
+    )
 
 class Bookmark(Base):
     __tablename__ = "bookmarks"
-    id = Column(Integer, primary_key=True, index=True)
+    id = Column(Integer, primary_key=True)
     url = Column(String, index=True)
     title = Column(String, nullable=True)
     added_time = Column(DateTime, index=True)
     folder = Column(String, index=True)
     domain = Column(String, index=True)
-
-class BlacklistedDomain(Base):
-    __tablename__ = "blacklisted_domains"
-
-    id = Column(Integer, primary_key=True)
-    domain = Column(String, unique=True, index=True)
-    reason = Column(String, nullable=True)
-    added_time = Column(DateTime, default=datetime.utcnow)
-
-    @classmethod
-    def is_blacklisted(cls, db: SessionLocal, domain: str) -> bool:
-        """Check if a domain is blacklisted"""
-        return db.query(cls).filter(cls.domain == domain.lower()).first() is not None
-
-    @classmethod
-    def add_to_blacklist(cls, db: SessionLocal, domain: str, reason: str = None):
-        """Add a domain to the blacklist"""
-        try:
-            blacklist_entry = cls(
-                domain=domain.lower(),
-                reason=reason
-            )
-            db.add(blacklist_entry)
-            db.commit()
-        except:
-            db.rollback()
-            # If entry already exists, just update the reason
-            existing = db.query(cls).filter(cls.domain == domain.lower()).first()
-            if existing and reason:
-                existing.reason = reason
-                db.commit()
+    __table_args__ = (
+        # Composite index for common query patterns
+        {'sqlite_with_rowid': True}  # Ensure we have rowids for better performance
+    )
 
+# Create tables
 Base.metadata.create_all(bind=engine)
 
+# Initialize FTS tables for full-text search
+def init_fts():
+    """Initialize Full Text Search tables"""
+    conn = engine.raw_connection()
+    cursor = conn.cursor()
+
+    # Create FTS table for history content
+    cursor.execute("""
+        CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5(
+            title,
+            markdown_content,
+            content='history',
+            content_rowid='id',
+            tokenize='porter unicode61'
+        )
+    """)
+
+    # Create triggers to keep FTS index up to date
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN
+            INSERT INTO history_fts(rowid, title, markdown_content)
+            VALUES (new.id, new.title, new.markdown_content);
+        END;
+    """)
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS history_ad AFTER DELETE ON history BEGIN
+            INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
+            VALUES('delete', old.id, old.title, old.markdown_content);
+        END;
+    """)
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS history_au AFTER UPDATE ON history BEGIN
+            INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
+            VALUES('delete', old.id, old.title, old.markdown_content);
+            INSERT INTO history_fts(rowid, title, markdown_content)
+            VALUES (new.id, new.title, new.markdown_content);
+        END;
+    """)
+
+    conn.commit()
+    cursor.close()
+    conn.close()
+
+# Initialize FTS tables
+init_fts()
+
 def get_db():
+    """Get database session"""
     db = SessionLocal()
     try:
         yield db
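
Note: a minimal sketch (not part of this commit) for confirming that the connect-event PRAGMAs above are applied to new connections; it assumes the package is importable as app.database:

    from sqlalchemy import text
    from app.database import engine

    with engine.connect() as conn:
        print(conn.execute(text("PRAGMA journal_mode")).scalar())  # expected: wal
        print(conn.execute(text("PRAGMA synchronous")).scalar())   # expected: 1 (NORMAL)
        print(conn.execute(text("PRAGMA cache_size")).scalar())    # expected: -32000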

app/logging_config.py (new file)

@@ -0,0 +1,52 @@
+import logging
+import logging.handlers
+import os
+from datetime import datetime
+from pathlib import Path
+
+# Create logs directory if it doesn't exist
+LOGS_DIR = Path("logs")
+LOGS_DIR.mkdir(exist_ok=True)
+
+# Create formatters
+CONSOLE_FORMAT = '%(levelname)s: %(message)s'
+FILE_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+def setup_logger(name: str) -> logging.Logger:
+    """
+    Set up a logger with both file and console handlers
+
+    Args:
+        name: The name of the logger (usually __name__)
+
+    Returns:
+        logging.Logger: Configured logger instance
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+
+    # Prevent adding handlers multiple times
+    if logger.handlers:
+        return logger
+
+    # Console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.WARNING)
+    console_handler.setFormatter(logging.Formatter(CONSOLE_FORMAT))
+
+    # File handler
+    log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m')}.log"
+    file_handler = logging.handlers.RotatingFileHandler(
+        log_file,
+        maxBytes=10*1024*1024,  # 10MB
+        backupCount=5,
+        encoding='utf-8'
+    )
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(logging.Formatter(FILE_FORMAT))
+
+    # Add handlers
+    logger.addHandler(console_handler)
+    logger.addHandler(file_handler)
+
+    return logger
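
Note: intended usage of the new helper (a sketch, not part of this commit); the console handler only emits WARNING and above, while INFO and above go to the monthly rotating file under logs/:

    from app.logging_config import setup_logger

    logger = setup_logger(__name__)
    logger.info("recorded in logs/<YYYY-MM>.log only")
    logger.warning("recorded in the log file and echoed to the console")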

app/main.py

@@ -1,6 +1,6 @@
-from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect
+from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException
 from sqlalchemy.orm import Session
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
 from typing import List, Optional
 import asyncio
 from fastapi import WebSocketDisconnect
@@ -8,14 +8,22 @@ from urllib.parse import urlparse
 import pytz
 from fastapi.middleware.cors import CORSMiddleware
 import iso8601
+from bs4 import BeautifulSoup
+from sqlalchemy import text
+from sqlalchemy.sql import text
+from .logging_config import setup_logger
 
 from .database import get_db, HistoryEntry, Bookmark
 from .scheduler import HistoryScheduler
 from .page_info import PageInfo
 from .page_reader import PageReader
+from .config import Config
+
+logger = setup_logger(__name__)
 
 app = FastAPI()
 scheduler = HistoryScheduler()
+config = Config()
 
 # Add CORS middleware to allow WebSocket connections
 app.add_middleware(
@@ -28,6 +36,7 @@ app.add_middleware(
 
 @app.on_event("startup")
 async def startup_event():
+    logger.info("Starting application")
     # Initial bookmark fetch
     await scheduler.update_bookmarks()
     # Start the background task
@@ -35,13 +44,24 @@
 def serialize_history_entry(entry, include_content: bool = False):
     """Serialize a HistoryEntry object to a dictionary"""
-    result = {
-        "id": entry.id,
-        "url": entry.url,
-        "title": entry.title,
-        "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
-        "domain": entry.domain,
-    }
+    # Handle both ORM objects and raw SQL results
+    if hasattr(entry, '_mapping'):  # Raw SQL result
+        result = {
+            "id": entry.id,
+            "url": entry.url,
+            "title": entry.title,
+            "visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
+            "domain": entry.domain,
+        }
+    else:  # ORM object
+        result = {
+            "id": entry.id,
+            "url": entry.url,
+            "title": entry.title,
+            "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
+            "domain": entry.domain,
+        }
     if include_content:
         result["markdown_content"] = entry.markdown_content
     return result
@@ -66,25 +86,54 @@
     include_content: bool = Query(False),
     db: Session = Depends(get_db)
 ):
-    query = db.query(HistoryEntry)
-
-    if domain:
-        query = query.filter(HistoryEntry.domain == domain)
-
-    if start_date:
-        query = query.filter(HistoryEntry.visit_time >= start_date)
-
-    if end_date:
-        query = query.filter(HistoryEntry.visit_time <= end_date)
-
-    if search_term:
-        query = query.filter(
-            (HistoryEntry.title.ilike(f"%{search_term}%")) |
-            (HistoryEntry.markdown_content.ilike(f"%{search_term}%"))
-        )
-
-    entries = query.all()
-    return [serialize_history_entry(entry, include_content) for entry in entries]
+    """Search history with optimized full-text search"""
+    try:
+        # If there's a full-text search term, use the FTS table
+        if search_term:
+            # Use raw SQL for FTS query to leverage SQLite's optimization
+            fts_query = """
+                SELECT h.* FROM history h
+                INNER JOIN history_fts f ON h.id = f.rowid
+                WHERE history_fts MATCH :search
+                AND (:domain IS NULL OR h.domain = :domain)
+                AND (:start_date IS NULL OR h.visit_time >= :start_date)
+                AND (:end_date IS NULL OR h.visit_time <= :end_date)
+                ORDER BY rank
+                LIMIT 1000
+            """
+            results = db.execute(
+                text(fts_query),
+                {
+                    'search': search_term,
+                    'domain': domain,
+                    'start_date': start_date,
+                    'end_date': end_date
+                }
+            ).all()
+
+            # Return serialized results directly
+            return [serialize_history_entry(row, include_content) for row in results]
+        else:
+            # Start with base query
+            query = db.query(HistoryEntry)
+
+            # Apply filters
+            if domain:
+                query = query.filter(HistoryEntry.domain == domain)
+            if start_date:
+                query = query.filter(HistoryEntry.visit_time >= start_date)
+            if end_date:
+                query = query.filter(HistoryEntry.visit_time <= end_date)
+
+            # Execute query with limit for better performance
+            entries = query.limit(1000).all()
+            return [serialize_history_entry(entry, include_content) for entry in entries]
+    except Exception as e:
+        print(f"Search error: {e}")
+        raise HTTPException(status_code=500, detail="Search operation failed")
 
 @app.get("/bookmarks/search")
 async def search_bookmarks(
@@ -93,84 +142,204 @@
     search_term: Optional[str] = Query(None),
     db: Session = Depends(get_db)
 ):
-    query = db.query(Bookmark)
-
-    if domain:
-        query = query.filter(Bookmark.domain == domain)
-
-    if folder:
-        query = query.filter(Bookmark.folder == folder)
-
-    if search_term:
-        query = query.filter(Bookmark.title.ilike(f"%{search_term}%"))
-
-    bookmarks = query.all()
-    return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+    """Search bookmarks with optimized queries"""
+    try:
+        # Build query efficiently
+        query = db.query(Bookmark)
+
+        # Apply filters using index-optimized queries
+        if domain:
+            query = query.filter(Bookmark.domain == domain)
+        if folder:
+            query = query.filter(Bookmark.folder == folder)
+        if search_term:
+            # Use LIKE with index hint for title search
+            search_pattern = f"%{search_term}%"
+            query = query.filter(
+                Bookmark.title.ilike(search_pattern)
+            ).with_hint(
+                Bookmark,
+                'INDEXED BY ix_bookmarks_title',
+                'sqlite'
+            )
+
+        # Add ordering and limit for better performance
+        bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
+        return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+    except Exception as e:
+        print(f"Bookmark search error: {e}")
+        raise HTTPException(status_code=500, detail="Search operation failed")
+
+# Add new endpoint for advanced full-text search
+@app.get("/history/search/advanced")
+async def advanced_history_search(
+    query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
+    include_content: bool = Query(False),
+    db: Session = Depends(get_db)
+):
+    """Advanced full-text search using SQLite FTS5 features"""
+    try:
+        # Use raw SQL for advanced FTS query
+        fts_query = """
+            SELECT h.*, rank
+            FROM history h
+            INNER JOIN history_fts f ON h.id = f.rowid
+            WHERE history_fts MATCH :query
+            ORDER BY rank
+            LIMIT 1000
+        """
+        results = db.execute(text(fts_query), {'query': query}).all()
+
+        # Convert results to HistoryEntry objects
+        entries = [
+            serialize_history_entry(
+                HistoryEntry(
+                    id=row.id,
+                    url=row.url,
+                    title=row.title,
+                    visit_time=row.visit_time,
+                    domain=row.domain,
+                    markdown_content=row.markdown_content if include_content else None
+                ),
+                include_content
+            )
+            for row in results
+        ]
+        return entries
+    except Exception as e:
+        print(f"Advanced search error: {e}")
+        raise HTTPException(status_code=500, detail="Advanced search operation failed")
 
 @app.websocket("/ws")
 async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
-    print("WebSocket endpoint called")
+    logger.info("New WebSocket connection established")
     page_reader = PageReader()
-    print("New WebSocket connection established")
     await websocket.accept()
-    print("WebSocket connection accepted")
     try:
         while True:
-            print("Waiting for message...")
             data = await websocket.receive_json()
-            print(f"Received message for URL: {data['url']}")
-            print(f"HTML content length: {len(data['html'])}")
-            print(f"Timestamp: {data['timestamp']}")
 
-            # Parse the ISO timestamp correctly
+            # Parse the URL and check if domain should be ignored
+            domain = urlparse(data['url']).netloc
+            if config.is_domain_ignored(domain):
+                logger.info(f"Ignoring domain: {domain}")
+                await websocket.send_json({
+                    "status": "ignored",
+                    "message": f"Domain {domain} is in ignore list"
+                })
+                continue
+
+            logger.info(f"Processing page: {data['url']}")
             timestamp = iso8601.parse_date(data['timestamp'])
 
+            # Check if we already have a recent entry for this URL
+            existing_entry = db.query(HistoryEntry).filter(
+                HistoryEntry.url == data['url'],
+                HistoryEntry.visit_time >= timestamp - timedelta(minutes=5)
+            ).first()
+
+            if existing_entry:
+                print(f"Recent entry exists for URL: {data['url']}")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "message": "Recent entry exists"
+                })
+                continue
+
             page_info = PageInfo(
                 url=data['url'],
                 html=data['html'],
                 timestamp=timestamp
             )
-            print(f"Created PageInfo object for: {page_info.url}")
 
-            # Convert HTML to markdown
-            print("Converting HTML to markdown...")
+            # Debug HTML content
+            print(f"HTML content length before processing: {len(page_info.html)}")
+
+            # Extract title
+            soup = BeautifulSoup(page_info.html, 'html.parser')
+            title = soup.title.string if soup.title else ''
+            print(f"Extracted title: {title}")
+
+            # Debug markdown conversion
+            print("Starting markdown conversion...")
+            cleaned_html = page_reader.clean_html(page_info.html)
+            print(f"Cleaned HTML length: {len(cleaned_html)}")
+
             markdown_content = page_reader.html_to_markdown(page_info.html)
-            print(f"Markdown conversion complete, length: {len(markdown_content) if markdown_content else 0}")
+            print(f"Markdown conversion complete. Content length: {len(markdown_content) if markdown_content else 0}")
+            if markdown_content:
+                print("First 100 chars of markdown:", markdown_content[:100])
+            else:
+                print("No markdown content generated")
 
-            # Update or create history entry
-            domain = urlparse(page_info.url).netloc
-            print(f"Creating history entry for domain: {domain}")
+            if not title and not markdown_content:
+                print(f"No content extracted from: {page_info.url}")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "message": "No content extracted"
+                })
+                continue
 
+            # Create history entry
             history_entry = HistoryEntry(
                 url=page_info.url,
+                title=title,
                 visit_time=page_info.timestamp,
                 domain=domain,
                 markdown_content=markdown_content,
                 last_content_update=datetime.now(timezone.utc)
             )
-            print("Saving to database...")
-            db.add(history_entry)
-            db.commit()
-            print("Database save complete")
 
-            # Send confirmation back to client
-            await websocket.send_json({
-                "status": "success",
-                "message": f"Processed page: {page_info.url}"
-            })
+            # Debug database operation
+            print(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}")
+
+            # Use bulk operations for better performance
+            db.add(history_entry)
+
+            try:
+                db.commit()
+                print(f"Successfully saved entry for: {page_info.url}")
+                print(f"Verify markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}")
+                await websocket.send_json({
+                    "status": "success",
+                    "message": f"Processed page: {page_info.url}"
+                })
+            except Exception as e:
+                db.rollback()
+                print(f"Error saving entry: {e}")
+                await websocket.send_json({
+                    "status": "error",
+                    "message": "Database error"
+                })
 
     except WebSocketDisconnect:
-        print("Client disconnected")
+        logger.info("Client disconnected")
     except Exception as e:
-        print(f"Error handling message: {e}")
+        logger.error("Error in WebSocket handler", exc_info=True)
+        # Send error back to client if possible
+        try:
+            await websocket.send_json({
+                "status": "error",
+                "message": str(e)
+            })
+        except:
+            pass
     finally:
-        print("Cleaning up resources")
-        page_reader.close()
+        await page_reader.close()
+
+@app.get("/config/ignored-domains")
+async def get_ignored_domains():
+    """Get list of ignored domain patterns"""
+    return {"ignored_domains": config.config.get('ignored_domains', [])}
+
+@app.post("/config/ignored-domains")
+async def add_ignored_domain(pattern: str):
+    """Add a new domain pattern to ignored list"""
+    config.add_ignored_domain(pattern)
+    return {"status": "success", "message": f"Added pattern: {pattern}"}
+
+@app.delete("/config/ignored-domains/{pattern}")
+async def remove_ignored_domain(pattern: str):
+    """Remove a domain pattern from ignored list"""
+    config.remove_ignored_domain(pattern)
+    return {"status": "success", "message": f"Removed pattern: {pattern}"}
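
Note: a quick sketch (not part of this commit) of exercising the new ignored-domains endpoints; it assumes the API is served on port 8523 (the port the extension's WebSocket targets) and that httpx is installed:

    import httpx

    BASE = "http://localhost:8523"

    # List the current ignore patterns
    print(httpx.get(f"{BASE}/config/ignored-domains").json())

    # Add a pattern (sent as a query parameter, matching the plain `pattern: str` signature)
    print(httpx.post(f"{BASE}/config/ignored-domains", params={"pattern": "*.internal"}).json())

    # Remove it again
    print(httpx.delete(f"{BASE}/config/ignored-domains/*.internal").json())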

app/page_reader.py

@@ -4,15 +4,11 @@ from bs4 import BeautifulSoup
 from typing import Optional
 from urllib.parse import urlparse
 from .config import ReaderConfig
-import logging
-from .database import SessionLocal, BlacklistedDomain
+from .logging_config import setup_logger
+from .database import SessionLocal
 
-# Setup logging with less verbose output
-logging.basicConfig(
-    level=logging.WARNING,
-    format='%(levelname)s: %(message)s'
-)
-logger = logging.getLogger(__name__)
+# Setup logger for this module
+logger = setup_logger(__name__)
 
 # Patterns for cleaning
 SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
@@ -26,13 +22,15 @@ SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
 class PageReader:
     def __init__(self):
         self.config = ReaderConfig()
-        self.db = SessionLocal()
+        logger.info("PageReader initialized")
 
     def clean_html(self, html: str) -> str:
         """Clean HTML by removing unwanted elements and patterns."""
         if not html:
+            logger.warning("Received empty HTML to clean")
             return ""
 
+        logger.debug(f"Cleaning HTML of length: {len(html)}")
         # First use regex to remove problematic patterns
         html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
         html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
@@ -54,12 +52,15 @@
             ]
 
             for element in elements_to_remove:
+                removed = len(soup.find_all(element))
+                if removed:
+                    logger.debug(f"Removed {removed} {element} elements")
                 for tag in soup.find_all(element):
                     tag.decompose()
 
             return str(soup)
         except Exception as e:
-            logger.error(f"Error cleaning HTML: {e}")
+            logger.error(f"Error cleaning HTML: {e}", exc_info=True)
             return ""
 
     def clean_whitespace(self, text: str) -> str:
@@ -80,11 +81,17 @@
     def html_to_markdown(self, html: str) -> Optional[str]:
         """Convert HTML to markdown."""
         try:
+            logger.info("Starting HTML to Markdown conversion")
+            logger.debug(f"Input HTML length: {len(html)}")
             cleaned_html = self.clean_html(html)
+            logger.debug(f"Cleaned HTML length: {len(cleaned_html)}")
             if not cleaned_html:
+                logger.warning("No cleaned HTML content")
                 return None
 
-            return self.clean_whitespace(md(cleaned_html,
+            markdown = self.clean_whitespace(md(cleaned_html,
                 heading_style="ATX",
                 bullets="-",
                 autolinks=True,
@@ -92,10 +99,19 @@
                 escape_asterisks=True,
                 escape_underscores=True))
 
+            logger.debug(f"Generated markdown length: {len(markdown) if markdown else 0}")
+            if not markdown or markdown.isspace():
+                logger.warning("Markdown is empty or whitespace only")
+                return None
+
+            return markdown
         except Exception as e:
-            logger.error(f"Error converting to markdown: {e}")
+            logger.error("Error converting to markdown", exc_info=True)
             return None
 
-    def close(self):
+    async def close(self):
         """Cleanup resources"""
-        self.db.close()
+        logger.info("Closing PageReader")
+        pass  # No need to close DB connection anymore
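
Note: a minimal sketch (not part of this commit) of driving PageReader directly; close() is now a coroutine, so it must be awaited, as the WebSocket handler above does:

    import asyncio
    from app.page_reader import PageReader

    async def demo():
        reader = PageReader()
        html = "<html><head><title>Hi</title></head><body><p>Hello world</p></body></html>"
        print(reader.html_to_markdown(html))  # markdown rendering of the cleaned page
        await reader.close()

    asyncio.run(demo())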

app/scheduler.py

@@ -7,6 +7,9 @@ from .page_reader import PageReader
 from sqlalchemy import func
 from sqlalchemy.orm import Session
 import pytz
+from .config import Config
+from .database import get_db
+from urllib.parse import urlparse
 
 class HistoryScheduler:
     def __init__(self):
@@ -14,6 +17,7 @@
         self.page_reader = PageReader()
         self.last_history_update = None
         self.content_update_interval = timedelta(hours=24)  # Update content daily
+        self.config = Config()
 
     def _normalize_datetime(self, dt: datetime) -> datetime:
         """Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't"""
@@ -28,81 +32,70 @@
         return dt.astimezone(pytz.UTC)
 
     async def update_bookmarks(self):
-        bookmarks = self.browser_collector.fetch_bookmarks()
-        db = SessionLocal()
+        """Update bookmarks from browser"""
         try:
-            # First, get all existing URLs to avoid duplicates
-            existing_urls = {
-                url: (added_time, folder)
-                for url, added_time, folder in
-                db.query(Bookmark.url, Bookmark.added_time, Bookmark.folder).all()
-            }
-
-            new_entries = []
-            for added_time, url, title, folder in bookmarks:
+            db = next(get_db())
+            bookmarks = self.browser_collector.fetch_bookmarks()
+
+            for added_time, url, title, folder in bookmarks:  # Unpack the tuple
+                # Extract domain and check if it should be ignored
+                domain = urlparse(url).netloc
+                if self.config.is_domain_ignored(domain):
+                    continue
+
                 # Normalize the datetime
                 added_time = self._normalize_datetime(added_time)
 
-                # Only add if URL doesn't exist or if it's in a different folder
-                if (url not in existing_urls or
-                    existing_urls[url][1] != folder):
-                    domain = self.browser_collector.get_domain(url)
-                    entry = Bookmark(
-                        url=url,
-                        title=title,
-                        added_time=added_time,
-                        folder=folder,
-                        domain=domain
-                    )
-                    new_entries.append(entry)
-
-            if new_entries:
-                db.bulk_save_objects(new_entries)
-                db.commit()
+                # Process the bookmark only if domain is not ignored
+                bookmark_entry = Bookmark(
+                    url=url,
+                    title=title,
+                    added_time=added_time,
+                    folder=folder,
+                    domain=domain
+                )
+                db.add(bookmark_entry)
+
+            db.commit()
+        except Exception as e:
+            print(f"Error updating bookmarks: {e}")
         finally:
            db.close()
 
     async def update_history(self):
+        """Background task to update history periodically"""
         while True:
-            db = SessionLocal()
             try:
-                # Get the latest timestamp from our database
-                latest_entry = db.query(func.max(HistoryEntry.visit_time)).scalar()
-                if latest_entry:
-                    latest_entry = self._normalize_datetime(latest_entry)
-
-                # Fetch new history
-                history = self.browser_collector.fetch_history()
-
-                # Filter to only get entries newer than our latest entry
-                new_entries = []
-                for visit_time, url, title in history:
+                db = next(get_db())
+                history_entries = self.browser_collector.fetch_history()
+
+                for visit_time, url, title in history_entries:  # Unpack the tuple
+                    # Extract domain and check if it should be ignored
+                    domain = urlparse(url).netloc
+                    if self.config.is_domain_ignored(domain):
+                        continue
+
                     # Normalize the datetime
                     visit_time = self._normalize_datetime(visit_time)
 
-                    if not latest_entry or visit_time > latest_entry:
-                        domain = self.browser_collector.get_domain(url)
-                        entry = HistoryEntry(
-                            url=url,
-                            title=title,
-                            visit_time=visit_time,
-                            domain=domain
-                        )
-                        new_entries.append(entry)
-
-                if new_entries:
-                    db.bulk_save_objects(new_entries)
-                    db.commit()
-
-                # Update bookmarks
-                await self.update_bookmarks()
-            except Exception as e:
-                print(f"Error updating history: {e}")
+                    # Process the entry only if domain is not ignored
+                    history_entry = HistoryEntry(
+                        url=url,
+                        title=title,
+                        visit_time=visit_time,
+                        domain=domain
+                    )
+                    db.add(history_entry)
+
+                db.commit()
             finally:
                 db.close()
 
-            # Wait for 5 minutes before next update
-            await asyncio.sleep(300)
+            await asyncio.sleep(300)  # Wait 5 minutes before next update
 
     async def close(self):
         """Cleanup resources"""

Extension background script

@@ -1,5 +1,82 @@
 console.log("Background script loaded");
 
+class WebSocketClient {
+    constructor() {
+        console.log("WebSocketClient constructor called");
+        this.messageQueue = [];
+        this.connect();
+        this.reconnectAttempts = 0;
+        this.maxReconnectAttempts = 5;
+    }
+
+    connect() {
+        console.log('Attempting to connect to WebSocket server...');
+        try {
+            this.ws = new WebSocket('ws://localhost:8523/ws');
+            console.log('WebSocket instance created');
+
+            this.ws.addEventListener('open', () => {
+                console.log('WebSocket connection opened successfully');
+                this.reconnectAttempts = 0;
+                this.processQueue();
+            });
+
+            this.ws.addEventListener('error', (event) => {
+                console.error('WebSocket error occurred:', event);
+            });
+
+            this.ws.addEventListener('close', (event) => {
+                console.log('WebSocket connection closed:', event.code, event.reason);
+                this.tryReconnect();
+            });
+
+            this.ws.addEventListener('message', (event) => {
+                console.log('Received message from server:', event.data);
+            });
+        } catch (error) {
+            console.error('Error creating WebSocket:', error);
+        }
+    }
+
+    processQueue() {
+        console.log(`Processing message queue (${this.messageQueue.length} messages)`);
+        while (this.messageQueue.length > 0) {
+            const data = this.messageQueue.shift();
+            this.sendMessage(data);
+        }
+    }
+
+    tryReconnect() {
+        if (this.reconnectAttempts < this.maxReconnectAttempts) {
+            this.reconnectAttempts++;
+            console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
+            setTimeout(() => this.connect(), 2000 * this.reconnectAttempts);
+        } else {
+            console.log('Max reconnection attempts reached');
+        }
+    }
+
+    sendMessage(data) {
+        if (this.ws.readyState === WebSocket.OPEN) {
+            try {
+                console.log('Sending data for URL:', data.url);
+                this.ws.send(JSON.stringify(data));
+                console.log('Data sent successfully');
+                return true;
+            } catch (error) {
+                console.error('Error sending data:', error);
+                return false;
+            }
+        } else {
+            console.log('WebSocket not ready, queueing message');
+            this.messageQueue.push(data);
+            return true;
+        }
+    }
+}
+
+const wsClient = new WebSocketClient();
+
 async function isContentScriptReady(tabId) {
     try {
         await browser.tabs.sendMessage(tabId, { type: "PING" });
@@ -38,9 +115,17 @@ async function sendMessageToTab(tabId) {
     }
 }
 
+// Listen for messages from content scripts
+browser.runtime.onMessage.addListener((message, sender) => {
+    if (message.type === "SEND_PAGE_CONTENT") {
+        console.log('Received page content from tab:', sender.tab.id);
+        wsClient.sendMessage(message.data);
+    }
+});
+
 browser.webNavigation.onCompleted.addListener(async (details) => {
     console.log("Navigation completed", details);
-    if (details.frameId === 0) { // Only handle main frame navigation
+    if (details.frameId === 0) {
         console.log(`Main frame navigation detected for tab ${details.tabId}`);
         await sendMessageToTab(details.tabId);
     }

Extension content script

@@ -1,132 +1,32 @@
 console.log("Content script starting initialization...");
 
-// Function to log WebSocket state
-function getWebSocketState(ws) {
-    const states = {
-        0: 'CONNECTING',
-        1: 'OPEN',
-        2: 'CLOSING',
-        3: 'CLOSED'
-    };
-    return states[ws.readyState] || 'UNKNOWN';
-}
-
-class WebSocketClient {
-    constructor() {
-        console.log("WebSocketClient constructor called");
-        this.messageQueue = [];
-        this.connect();
-        this.reconnectAttempts = 0;
-        this.maxReconnectAttempts = 5;
-    }
-
-    connect() {
-        console.log('Attempting to connect to WebSocket server...');
-        try {
-            this.ws = new WebSocket('ws://localhost:8523/ws');
-            console.log('WebSocket instance created');
-
-            this.ws.addEventListener('open', () => {
-                console.log('WebSocket connection opened successfully');
-                this.reconnectAttempts = 0;
-                // Process any queued messages
-                this.processQueue();
-            });
-
-            this.ws.addEventListener('error', (event) => {
-                console.error('WebSocket error occurred:', event);
-            });
-
-            this.ws.addEventListener('close', (event) => {
-                console.log('WebSocket connection closed:', event.code, event.reason);
-                this.tryReconnect();
-            });
-
-            this.ws.addEventListener('message', (event) => {
-                console.log('Received message from server:', event.data);
-            });
-        } catch (error) {
-            console.error('Error creating WebSocket:', error);
-        }
-    }
-
-    processQueue() {
-        console.log(`Processing message queue (${this.messageQueue.length} messages)`);
-        while (this.messageQueue.length > 0) {
-            const data = this.messageQueue.shift();
-            this.sendMessage(data);
-        }
-    }
-
-    tryReconnect() {
-        if (this.reconnectAttempts < this.maxReconnectAttempts) {
-            this.reconnectAttempts++;
-            console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
-            setTimeout(() => this.connect(), 2000 * this.reconnectAttempts);
-        } else {
-            console.log('Max reconnection attempts reached');
-        }
-    }
-
-    sendMessage(data) {
-        console.log('sendMessage called, WebSocket state:', getWebSocketState(this.ws));
-        if (this.ws.readyState === WebSocket.OPEN) {
-            try {
-                console.log('Preparing to send data:', {
-                    url: data.url,
-                    timestamp: data.timestamp,
-                    htmlLength: data.html.length
-                });
-                this.ws.send(JSON.stringify(data));
-                console.log('Data sent successfully');
-                return true;
-            } catch (error) {
-                console.error('Error sending data:', error);
-                return false;
-            }
-        } else {
-            console.log('WebSocket not ready, queueing message');
-            this.messageQueue.push(data);
-            return true;
-        }
-    }
-}
-
-console.log("Creating WebSocketClient instance...");
-const wsClient = new WebSocketClient();
-
-console.log("Setting up message listener...");
+function sendPageContent() {
+    const pageContent = {
+        url: window.location.href,
+        html: document.documentElement.outerHTML,
+        timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
+    };
+
+    browser.runtime.sendMessage({
+        type: "SEND_PAGE_CONTENT",
+        data: pageContent
+    });
+}
+
+// Listen for messages from the background script
 browser.runtime.onMessage.addListener((message, sender, sendResponse) => {
-    console.log('Message received from background script:', message);
     if (message.type === "PING") {
-        console.log('Received PING, responding...');
         return Promise.resolve({ status: "ready" });
     }
     if (message.type === "GET_PAGE_CONTENT") {
-        console.log('Processing GET_PAGE_CONTENT message');
-        const pageContent = {
-            url: window.location.href,
-            html: document.documentElement.outerHTML,
-            timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
-        };
-        console.log('Created page content object for:', pageContent.url);
-        wsClient.sendMessage(pageContent);
+        sendPageContent();
     }
     return true;
 });
 
 // Send initial page content
-console.log('Sending initial page content...');
-const pageContent = {
-    url: window.location.href,
-    html: document.documentElement.outerHTML,
-    timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
-};
-wsClient.sendMessage(pageContent);
+sendPageContent();
 
 console.log("Content script initialization complete for:", window.location.href);

Deleted file (standalone httpx-based HTML-to-markdown helper)

@@ -1,84 +0,0 @@
-import httpx
-import re
-from markdownify import markdownify as md
-from bs4 import BeautifulSoup
-
-# Patterns for cleaning
-SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
-STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
-META_PATTERN = r"<[ ]*meta.*?>"
-COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
-LINK_PATTERN = r"<[ ]*link.*?>"
-BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
-SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
-
-def clean_html(html: str) -> str:
-    """Clean HTML by removing unwanted elements and patterns."""
-    # First use regex to remove problematic patterns
-    html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(BASE64_IMG_PATTERN, "", html)
-
-    # Use BeautifulSoup to remove additional elements we want to strip
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Remove unwanted elements
-    elements_to_remove = [
-        'canvas', 'img', 'picture', 'audio', 'video',
-        'iframe', 'embed', 'object', 'param', 'track',
-        'map', 'area', 'source'
-    ]
-
-    for element in elements_to_remove:
-        for tag in soup.find_all(element):
-            tag.decompose()
-
-    return str(soup)
-
-def get_page_html(url: str) -> str:
-    """Fetch HTML content from a given URL using httpx."""
-    headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
-    }
-    try:
-        with httpx.Client(follow_redirects=True) as client:
-            response = client.get(url, headers=headers)
-            response.raise_for_status()
-            return response.text
-    except httpx.HTTPError as e:
-        print(f"Error fetching page: {e}")
-        return ""
-
-def clean_whitespace(text: str) -> str:
-    """Clean excessive whitespace from text, collapsing more than 2 newlines."""
-    # Replace 3 or more newlines with 2 newlines
-    cleaned = re.sub(r'\n{3,}', '\n\n', text)
-    # Remove trailing whitespace from each line
-    cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
-    return cleaned.strip()
-
-def html_to_markdown(url: str) -> str:
-    """Convert webpage HTML to markdown."""
-    html = get_page_html(url)
-    if not html:
-        return ""
-
-    # Clean the HTML first
-    cleaned_html = clean_html(html)
-
-    # Convert to markdown using markdownify
-    # Configure markdownify options for clean output
-    markdown = md(cleaned_html,
-                  heading_style="ATX",      # Use # style headers
-                  bullets="-",              # Use - for bullets
-                  autolinks=True,           # Convert URLs to links
-                  strip=['form'],           # Additional elements to strip
-                  escape_asterisks=True,
-                  escape_underscores=True)
-
-    # Clean up excessive whitespace
-    return clean_whitespace(markdown)

requirements.txt

@@ -2,9 +2,10 @@ fastapi
 uvicorn
 sqlalchemy
 browser-history
-beautifulsoup4
+beautifulsoup4>=4.9.3
 markdownify
-pyyaml
+pyyaml>=6.0.1
 pytz
 websockets==11.0.3
 iso8601==2.1.0
+lxml>=4.9.3