Mirror of https://github.com/Zetaphor/browser-recall.git, synced 2025-12-06 02:19:37 +00:00

Commit: Update extension to use single websocket and like 100 other things
.gitignore (vendored, 6 changed lines)

@@ -1 +1,5 @@
-__pycache__/
+__pycache__/
+logs/
+*.db
+*.db-shm
+*.db-wal
app/config.py

@@ -3,6 +3,44 @@ from pathlib import Path
 from typing import Set
 import fnmatch
 
+
+class Config:
+    def __init__(self):
+        self.config_path = Path(__file__).parent / "config.yaml"
+        self.load_config()
+
+    def load_config(self):
+        if not self.config_path.exists():
+            self.config = {"ignored_domains": []}
+            self.save_config()
+        else:
+            with open(self.config_path, 'r') as f:
+                self.config = yaml.safe_load(f)
+
+    def save_config(self):
+        with open(self.config_path, 'w') as f:
+            yaml.dump(self.config, f)
+
+    def is_domain_ignored(self, domain: str) -> bool:
+        """Check if a domain matches any of the ignored patterns"""
+        patterns = self.config.get('ignored_domains', [])
+        return any(fnmatch.fnmatch(domain.lower(), pattern.lower()) for pattern in patterns)
+
+    def add_ignored_domain(self, pattern: str):
+        """Add a new domain pattern to the ignored list"""
+        if 'ignored_domains' not in self.config:
+            self.config['ignored_domains'] = []
+        if pattern not in self.config['ignored_domains']:
+            self.config['ignored_domains'].append(pattern)
+            self.save_config()
+
+    def remove_ignored_domain(self, pattern: str):
+        """Remove a domain pattern from the ignored list"""
+        if 'ignored_domains' in self.config:
+            self.config['ignored_domains'] = [
+                p for p in self.config['ignored_domains'] if p != pattern
+            ]
+            self.save_config()
+
+
 class ReaderConfig:
     def __init__(self):
         self.excluded_patterns: Set[str] = set()
app/config.yaml (new file, 13 lines)

@@ -0,0 +1,13 @@
+# Domains that should be ignored by the history tracker
+# Supports wildcards (*) for pattern matching
+ignored_domains:
+  - "192.168.*"            # Ignore local network addresses
+  - "127.0.0.1"            # Ignore localhost IP addresses
+  - "localhost"            # Ignore localhost domains
+  - "172.*"
+  - "localhost:*"          # Ignore all localhost ports
+  - "127.0.0.1:*"          # Ignore all localhost IP ports
+  - "*.local"              # Ignore .local domains
+  - "about:*"              # Ignore about: URLs
+  - "chrome-extension://*" # Ignore Chrome extensions
+  - "chrome://*"           # Ignore Chrome URLs
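These patterns are applied by Config.is_domain_ignored via Python's fnmatch, so shell-style wildcards match the whole domain string, including any port. A minimal standalone sketch of that matching behaviour, with a pattern list copied from the config above:

import fnmatch

ignored = ["192.168.*", "localhost", "localhost:*", "*.local", "chrome://*"]

def is_ignored(domain: str) -> bool:
    # Case-insensitive shell-style matching, same approach as Config.is_domain_ignored
    return any(fnmatch.fnmatch(domain.lower(), p.lower()) for p in ignored)

print(is_ignored("192.168.1.10"))   # True, matches "192.168.*"
print(is_ignored("myhost.local"))   # True, matches "*.local"
print(is_ignored("example.com"))    # False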
app/database.py (147 changed lines)

@@ -1,70 +1,143 @@
-from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text
+from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text, event
 from sqlalchemy.ext.declarative import declarative_base
 from sqlalchemy.orm import sessionmaker
 from datetime import datetime
+import sqlite3
 
 SQLALCHEMY_DATABASE_URL = "sqlite:///./browser_history.db"
 
-engine = create_engine(SQLALCHEMY_DATABASE_URL)
-SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
+# Create engine with custom configuration
+engine = create_engine(
+    SQLALCHEMY_DATABASE_URL,
+    connect_args={
+        "timeout": 30,  # Connection timeout in seconds
+        "check_same_thread": False,  # Allow multi-threaded access
+    },
+    # Enable write-ahead logging and set a larger pool size
+    pool_size=1,  # Single connection pool since we're using one connection
+    max_overflow=0,  # Prevent additional connections
+    pool_recycle=3600,  # Recycle connection every hour
+)
+
+SessionLocal = sessionmaker(
+    autocommit=False,
+    autoflush=False,
+    bind=engine,
+    expire_on_commit=False  # Prevent unnecessary reloads
+)
 
 Base = declarative_base()
 
+@event.listens_for(engine, "connect")
+def set_sqlite_pragma(dbapi_connection, connection_record):
+    """Configure SQLite for better performance"""
+    if isinstance(dbapi_connection, sqlite3.Connection):
+        cursor = dbapi_connection.cursor()
+
+        # Enable WAL mode for better write performance and concurrency
+        cursor.execute("PRAGMA journal_mode=WAL")
+
+        # Set page size to 4KB for better performance
+        cursor.execute("PRAGMA page_size=4096")
+
+        # Set cache size to 32MB (-32000 pages * 4KB per page = ~32MB)
+        cursor.execute("PRAGMA cache_size=-32000")
+
+        # Enable memory-mapped I/O for better performance
+        cursor.execute("PRAGMA mmap_size=268435456")  # 256MB
+
+        # Set synchronous mode to NORMAL for better write performance
+        cursor.execute("PRAGMA synchronous=NORMAL")
+
+        # Enable foreign key support
+        cursor.execute("PRAGMA foreign_keys=ON")
+
+        cursor.close()
+
 class HistoryEntry(Base):
     __tablename__ = "history"
 
     id = Column(Integer, primary_key=True)
-    url = Column(String)
+    url = Column(String, index=True)  # Add index for URL lookups
     title = Column(String)
-    visit_time = Column(DateTime)
+    visit_time = Column(DateTime, index=True)  # Add index for time-based queries
-    domain = Column(String)
+    domain = Column(String, index=True)  # Add index for domain filtering
     markdown_content = Column(Text, nullable=True)
     last_content_update = Column(DateTime, nullable=True)
 
+    __table_args__ = (
+        # Composite index for common query patterns
+        {'sqlite_with_rowid': True}  # Ensure we have rowids for better performance
+    )
+
 class Bookmark(Base):
     __tablename__ = "bookmarks"
 
-    id = Column(Integer, primary_key=True, index=True)
+    id = Column(Integer, primary_key=True)
     url = Column(String, index=True)
     title = Column(String, nullable=True)
     added_time = Column(DateTime, index=True)
     folder = Column(String, index=True)
     domain = Column(String, index=True)
 
-class BlacklistedDomain(Base):
-    __tablename__ = "blacklisted_domains"
-
-    id = Column(Integer, primary_key=True)
-    domain = Column(String, unique=True, index=True)
-    reason = Column(String, nullable=True)
-    added_time = Column(DateTime, default=datetime.utcnow)
-
-    @classmethod
-    def is_blacklisted(cls, db: SessionLocal, domain: str) -> bool:
-        """Check if a domain is blacklisted"""
-        return db.query(cls).filter(cls.domain == domain.lower()).first() is not None
-
-    @classmethod
-    def add_to_blacklist(cls, db: SessionLocal, domain: str, reason: str = None):
-        """Add a domain to the blacklist"""
-        try:
-            blacklist_entry = cls(
-                domain=domain.lower(),
-                reason=reason
-            )
-            db.add(blacklist_entry)
-            db.commit()
-        except:
-            db.rollback()
-            # If entry already exists, just update the reason
-            existing = db.query(cls).filter(cls.domain == domain.lower()).first()
-            if existing and reason:
-                existing.reason = reason
-                db.commit()
-
+    __table_args__ = (
+        # Composite index for common query patterns
+        {'sqlite_with_rowid': True}  # Ensure we have rowids for better performance
+    )
+
+# Create tables
 Base.metadata.create_all(bind=engine)
+
+# Initialize FTS tables for full-text search
+def init_fts():
+    """Initialize Full Text Search tables"""
+    conn = engine.raw_connection()
+    cursor = conn.cursor()
+
+    # Create FTS table for history content
+    cursor.execute("""
+        CREATE VIRTUAL TABLE IF NOT EXISTS history_fts USING fts5(
+            title,
+            markdown_content,
+            content='history',
+            content_rowid='id',
+            tokenize='porter unicode61'
+        )
+    """)
+
+    # Create triggers to keep FTS index up to date
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS history_ai AFTER INSERT ON history BEGIN
+            INSERT INTO history_fts(rowid, title, markdown_content)
+            VALUES (new.id, new.title, new.markdown_content);
+        END;
+    """)
+
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS history_ad AFTER DELETE ON history BEGIN
+            INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
+            VALUES('delete', old.id, old.title, old.markdown_content);
+        END;
+    """)
+
+    cursor.execute("""
+        CREATE TRIGGER IF NOT EXISTS history_au AFTER UPDATE ON history BEGIN
+            INSERT INTO history_fts(history_fts, rowid, title, markdown_content)
+            VALUES('delete', old.id, old.title, old.markdown_content);
+            INSERT INTO history_fts(rowid, title, markdown_content)
+            VALUES (new.id, new.title, new.markdown_content);
+        END;
+    """)
+
+    conn.commit()
+    cursor.close()
+    conn.close()
+
+# Initialize FTS tables
+init_fts()
 
 def get_db():
+    """Get database session"""
     db = SessionLocal()
     try:
         yield db
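history_fts is an external-content FTS5 table kept in sync with history by the three triggers above, so a search joins it back to history by rowid and can sort by FTS5's rank. A minimal sketch of querying it directly with sqlite3 (table and column names come from the diff; the search term and database path are just examples, assuming the backend has already created browser_history.db):

import sqlite3

conn = sqlite3.connect("browser_history.db")

rows = conn.execute(
    """
    SELECT h.url, h.title
    FROM history h
    JOIN history_fts f ON h.id = f.rowid
    WHERE history_fts MATCH ?   -- FTS5 query syntax, e.g. 'sqlite AND wal'
    ORDER BY rank               -- FTS5 relevance ordering
    LIMIT 10
    """,
    ("sqlite",),
).fetchall()

for url, title in rows:
    print(title, url)

conn.close()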
app/logging_config.py (new file, 52 lines)

@@ -0,0 +1,52 @@
+import logging
+import logging.handlers
+import os
+from datetime import datetime
+from pathlib import Path
+
+# Create logs directory if it doesn't exist
+LOGS_DIR = Path("logs")
+LOGS_DIR.mkdir(exist_ok=True)
+
+# Create formatters
+CONSOLE_FORMAT = '%(levelname)s: %(message)s'
+FILE_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+
+def setup_logger(name: str) -> logging.Logger:
+    """
+    Set up a logger with both file and console handlers
+
+    Args:
+        name: The name of the logger (usually __name__)
+
+    Returns:
+        logging.Logger: Configured logger instance
+    """
+    logger = logging.getLogger(name)
+    logger.setLevel(logging.INFO)
+
+    # Prevent adding handlers multiple times
+    if logger.handlers:
+        return logger
+
+    # Console handler
+    console_handler = logging.StreamHandler()
+    console_handler.setLevel(logging.WARNING)
+    console_handler.setFormatter(logging.Formatter(CONSOLE_FORMAT))
+
+    # File handler
+    log_file = LOGS_DIR / f"{datetime.now().strftime('%Y-%m')}.log"
+    file_handler = logging.handlers.RotatingFileHandler(
+        log_file,
+        maxBytes=10*1024*1024,  # 10MB
+        backupCount=5,
+        encoding='utf-8'
+    )
+    file_handler.setLevel(logging.INFO)
+    file_handler.setFormatter(logging.Formatter(FILE_FORMAT))
+
+    # Add handlers
+    logger.addHandler(console_handler)
+    logger.addHandler(file_handler)
+
+    return logger
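A short usage sketch for this helper (the import path assumes the package is named app, as in the other modules): WARNING and above reach the console, INFO and above go to a monthly file under logs/ with 10 MB rotation.

from app.logging_config import setup_logger

logger = setup_logger(__name__)

logger.info("written only to logs/<YYYY-MM>.log")          # below the console threshold
logger.warning("written to both the log file and console")  # passes both handlers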
app/main.py (305 changed lines)

@@ -1,6 +1,6 @@
-from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect
+from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect, HTTPException
 from sqlalchemy.orm import Session
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
 from typing import List, Optional
 import asyncio
 from fastapi import WebSocketDisconnect
@@ -8,14 +8,22 @@ from urllib.parse import urlparse
 import pytz
 from fastapi.middleware.cors import CORSMiddleware
 import iso8601
+from bs4 import BeautifulSoup
+from sqlalchemy import text
+from sqlalchemy.sql import text
+from .logging_config import setup_logger
 
 from .database import get_db, HistoryEntry, Bookmark
 from .scheduler import HistoryScheduler
 from .page_info import PageInfo
 from .page_reader import PageReader
+from .config import Config
+
+logger = setup_logger(__name__)
 
 app = FastAPI()
 scheduler = HistoryScheduler()
+config = Config()
 
 # Add CORS middleware to allow WebSocket connections
 app.add_middleware(
@@ -28,6 +36,7 @@ app.add_middleware(
 
 @app.on_event("startup")
 async def startup_event():
+    logger.info("Starting application")
     # Initial bookmark fetch
     await scheduler.update_bookmarks()
     # Start the background task
@@ -35,13 +44,24 @@ async def startup_event():
 
 def serialize_history_entry(entry, include_content: bool = False):
     """Serialize a HistoryEntry object to a dictionary"""
-    result = {
-        "id": entry.id,
-        "url": entry.url,
-        "title": entry.title,
-        "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
-        "domain": entry.domain,
-    }
+    # Handle both ORM objects and raw SQL results
+    if hasattr(entry, '_mapping'):  # Raw SQL result
+        result = {
+            "id": entry.id,
+            "url": entry.url,
+            "title": entry.title,
+            "visit_time": entry.visit_time.isoformat() if isinstance(entry.visit_time, datetime) else entry.visit_time,
+            "domain": entry.domain,
+        }
+    else:  # ORM object
+        result = {
+            "id": entry.id,
+            "url": entry.url,
+            "title": entry.title,
+            "visit_time": entry.visit_time.isoformat() if entry.visit_time else None,
+            "domain": entry.domain,
+        }
 
     if include_content:
         result["markdown_content"] = entry.markdown_content
     return result
@@ -66,25 +86,54 @@ async def search_history(
     include_content: bool = Query(False),
     db: Session = Depends(get_db)
 ):
-    query = db.query(HistoryEntry)
-
-    if domain:
-        query = query.filter(HistoryEntry.domain == domain)
-
-    if start_date:
-        query = query.filter(HistoryEntry.visit_time >= start_date)
-
-    if end_date:
-        query = query.filter(HistoryEntry.visit_time <= end_date)
-
-    if search_term:
-        query = query.filter(
-            (HistoryEntry.title.ilike(f"%{search_term}%")) |
-            (HistoryEntry.markdown_content.ilike(f"%{search_term}%"))
-        )
-
-    entries = query.all()
-    return [serialize_history_entry(entry, include_content) for entry in entries]
+    """Search history with optimized full-text search"""
+    try:
+        # If there's a full-text search term, use the FTS table
+        if search_term:
+            # Use raw SQL for FTS query to leverage SQLite's optimization
+            fts_query = """
+                SELECT h.* FROM history h
+                INNER JOIN history_fts f ON h.id = f.rowid
+                WHERE history_fts MATCH :search
+                AND (:domain IS NULL OR h.domain = :domain)
+                AND (:start_date IS NULL OR h.visit_time >= :start_date)
+                AND (:end_date IS NULL OR h.visit_time <= :end_date)
+                ORDER BY rank
+                LIMIT 1000
+            """
+            results = db.execute(
+                text(fts_query),
+                {
+                    'search': search_term,
+                    'domain': domain,
+                    'start_date': start_date,
+                    'end_date': end_date
+                }
+            ).all()
+
+            # Return serialized results directly
+            return [serialize_history_entry(row, include_content) for row in results]
+        else:
+            # Start with base query
+            query = db.query(HistoryEntry)
+
+            # Apply filters
+            if domain:
+                query = query.filter(HistoryEntry.domain == domain)
+
+            if start_date:
+                query = query.filter(HistoryEntry.visit_time >= start_date)
+
+            if end_date:
+                query = query.filter(HistoryEntry.visit_time <= end_date)
+
+            # Execute query with limit for better performance
+            entries = query.limit(1000).all()
+            return [serialize_history_entry(entry, include_content) for entry in entries]
+
+    except Exception as e:
+        print(f"Search error: {e}")
+        raise HTTPException(status_code=500, detail="Search operation failed")
 
 @app.get("/bookmarks/search")
 async def search_bookmarks(
@@ -93,84 +142,204 @@ async def search_bookmarks(
     search_term: Optional[str] = Query(None),
     db: Session = Depends(get_db)
 ):
-    query = db.query(Bookmark)
-
-    if domain:
-        query = query.filter(Bookmark.domain == domain)
-
-    if folder:
-        query = query.filter(Bookmark.folder == folder)
-
-    if search_term:
-        query = query.filter(Bookmark.title.ilike(f"%{search_term}%"))
-
-    bookmarks = query.all()
-    return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+    """Search bookmarks with optimized queries"""
+    try:
+        # Build query efficiently
+        query = db.query(Bookmark)
+
+        # Apply filters using index-optimized queries
+        if domain:
+            query = query.filter(Bookmark.domain == domain)
+
+        if folder:
+            query = query.filter(Bookmark.folder == folder)
+
+        if search_term:
+            # Use LIKE with index hint for title search
+            search_pattern = f"%{search_term}%"
+            query = query.filter(
+                Bookmark.title.ilike(search_pattern)
+            ).with_hint(
+                Bookmark,
+                'INDEXED BY ix_bookmarks_title',
+                'sqlite'
+            )
+
+        # Add ordering and limit for better performance
+        bookmarks = query.order_by(Bookmark.added_time.desc()).limit(1000).all()
+
+        return [serialize_bookmark(bookmark) for bookmark in bookmarks]
+
+    except Exception as e:
+        print(f"Bookmark search error: {e}")
+        raise HTTPException(status_code=500, detail="Search operation failed")
+
+# Add new endpoint for advanced full-text search
+@app.get("/history/search/advanced")
+async def advanced_history_search(
+    query: str = Query(..., description="Full-text search query with SQLite FTS5 syntax"),
+    include_content: bool = Query(False),
+    db: Session = Depends(get_db)
+):
+    """Advanced full-text search using SQLite FTS5 features"""
+    try:
+        # Use raw SQL for advanced FTS query
+        fts_query = """
+            SELECT h.*, rank
+            FROM history h
+            INNER JOIN history_fts f ON h.id = f.rowid
+            WHERE history_fts MATCH :query
+            ORDER BY rank
+            LIMIT 1000
+        """
+
+        results = db.execute(text(fts_query), {'query': query}).all()
+
+        # Convert results to HistoryEntry objects
+        entries = [
+            serialize_history_entry(
+                HistoryEntry(
+                    id=row.id,
+                    url=row.url,
+                    title=row.title,
+                    visit_time=row.visit_time,
+                    domain=row.domain,
+                    markdown_content=row.markdown_content if include_content else None
+                ),
+                include_content
+            )
+            for row in results
+        ]
+
+        return entries
+
+    except Exception as e:
+        print(f"Advanced search error: {e}")
+        raise HTTPException(status_code=500, detail="Advanced search operation failed")
 
-@app.websocket("/ws")
-async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
-    print("WebSocket endpoint called")
-    page_reader = PageReader()
-    print("New WebSocket connection established")
-    await websocket.accept()
-    print("WebSocket connection accepted")
-    try:
-        while True:
-            print("Waiting for message...")
-            data = await websocket.receive_json()
-            print(f"Received message for URL: {data['url']}")
-            print(f"HTML content length: {len(data['html'])}")
-            print(f"Timestamp: {data['timestamp']}")
-
-            # Parse the ISO timestamp correctly
-            timestamp = iso8601.parse_date(data['timestamp'])
-
-            page_info = PageInfo(
-                url=data['url'],
-                html=data['html'],
-                timestamp=timestamp
-            )
-            print(f"Created PageInfo object for: {page_info.url}")
-
-            # Convert HTML to markdown
-            print("Converting HTML to markdown...")
-            markdown_content = page_reader.html_to_markdown(page_info.html)
-            print(f"Markdown conversion complete, length: {len(markdown_content) if markdown_content else 0}")
-
-            # Update or create history entry
-            domain = urlparse(page_info.url).netloc
-            print(f"Creating history entry for domain: {domain}")
-            history_entry = HistoryEntry(
-                url=page_info.url,
-                visit_time=page_info.timestamp,
-                domain=domain,
-                markdown_content=markdown_content,
-                last_content_update=datetime.now(timezone.utc)
-            )
-
-            print("Saving to database...")
-            db.add(history_entry)
-            db.commit()
-            print("Database save complete")
-
-            # Send confirmation back to client
-            await websocket.send_json({
-                "status": "success",
-                "message": f"Processed page: {page_info.url}"
-            })
-
-    except WebSocketDisconnect:
-        print("Client disconnected")
-    except Exception as e:
-        print(f"Error handling message: {e}")
-        # Send error back to client if possible
-        try:
-            await websocket.send_json({
-                "status": "error",
-                "message": str(e)
-            })
-        except:
-            pass
-    finally:
-        print("Cleaning up resources")
-        page_reader.close()
+@app.websocket("/ws")
+async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)):
+    logger.info("New WebSocket connection established")
+    page_reader = PageReader()
+    await websocket.accept()
+    try:
+        while True:
+            data = await websocket.receive_json()
+
+            # Parse the URL and check if domain should be ignored
+            domain = urlparse(data['url']).netloc
+            if config.is_domain_ignored(domain):
+                logger.info(f"Ignoring domain: {domain}")
+                await websocket.send_json({
+                    "status": "ignored",
+                    "message": f"Domain {domain} is in ignore list"
+                })
+                continue
+
+            logger.info(f"Processing page: {data['url']}")
+            timestamp = iso8601.parse_date(data['timestamp'])
+
+            # Check if we already have a recent entry for this URL
+            existing_entry = db.query(HistoryEntry).filter(
+                HistoryEntry.url == data['url'],
+                HistoryEntry.visit_time >= timestamp - timedelta(minutes=5)
+            ).first()
+
+            if existing_entry:
+                print(f"Recent entry exists for URL: {data['url']}")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "message": "Recent entry exists"
+                })
+                continue
+
+            page_info = PageInfo(
+                url=data['url'],
+                html=data['html'],
+                timestamp=timestamp
+            )
+
+            # Debug HTML content
+            print(f"HTML content length before processing: {len(page_info.html)}")
+
+            # Extract title
+            soup = BeautifulSoup(page_info.html, 'html.parser')
+            title = soup.title.string if soup.title else ''
+            print(f"Extracted title: {title}")
+
+            # Debug markdown conversion
+            print("Starting markdown conversion...")
+            cleaned_html = page_reader.clean_html(page_info.html)
+            print(f"Cleaned HTML length: {len(cleaned_html)}")
+
+            markdown_content = page_reader.html_to_markdown(page_info.html)
+            print(f"Markdown conversion complete. Content length: {len(markdown_content) if markdown_content else 0}")
+            if markdown_content:
+                print("First 100 chars of markdown:", markdown_content[:100])
+            else:
+                print("No markdown content generated")
+
+            if not title and not markdown_content:
+                print(f"No content extracted from: {page_info.url}")
+                await websocket.send_json({
+                    "status": "skipped",
+                    "message": "No content extracted"
+                })
+                continue
+
+            # Create history entry
+            history_entry = HistoryEntry(
+                url=page_info.url,
+                title=title,
+                visit_time=page_info.timestamp,
+                domain=domain,
+                markdown_content=markdown_content,
+                last_content_update=datetime.now(timezone.utc)
+            )
+
+            # Debug database operation
+            print(f"Saving entry with markdown length: {len(markdown_content) if markdown_content else 0}")
+
+            # Use bulk operations for better performance
+            db.add(history_entry)
+
+            try:
+                db.commit()
+                print(f"Successfully saved entry for: {page_info.url}")
+                print(f"Verify markdown content length in database: {len(history_entry.markdown_content) if history_entry.markdown_content else 0}")
+                await websocket.send_json({
+                    "status": "success",
+                    "message": f"Processed page: {page_info.url}"
+                })
+            except Exception as e:
+                db.rollback()
+                print(f"Error saving entry: {e}")
+                await websocket.send_json({
+                    "status": "error",
+                    "message": "Database error"
+                })
+
+    except WebSocketDisconnect:
+        logger.info("Client disconnected")
+    except Exception as e:
+        logger.error("Error in WebSocket handler", exc_info=True)
+    finally:
+        await page_reader.close()
+
+@app.get("/config/ignored-domains")
+async def get_ignored_domains():
+    """Get list of ignored domain patterns"""
+    return {"ignored_domains": config.config.get('ignored_domains', [])}
+
+@app.post("/config/ignored-domains")
+async def add_ignored_domain(pattern: str):
+    """Add a new domain pattern to ignored list"""
+    config.add_ignored_domain(pattern)
+    return {"status": "success", "message": f"Added pattern: {pattern}"}
+
+@app.delete("/config/ignored-domains/{pattern}")
+async def remove_ignored_domain(pattern: str):
+    """Remove a domain pattern from ignored list"""
+    config.remove_ignored_domain(pattern)
+    return {"status": "success", "message": f"Removed pattern: {pattern}"}
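A quick sketch of exercising the new ignored-domains endpoints with httpx (httpx already appears elsewhere in the repo; the host and port are an assumption taken from the extension's ws://localhost:8523 URL, and the pattern is an arbitrary example). FastAPI treats the bare str parameter as a query parameter for the POST route and as a path parameter for the DELETE route:

import httpx

BASE = "http://127.0.0.1:8523"  # assumed host/port, matching the extension's WebSocket URL

# List the currently ignored patterns
print(httpx.get(f"{BASE}/config/ignored-domains").json())

# Add a pattern, then remove it again
httpx.post(f"{BASE}/config/ignored-domains", params={"pattern": "*.example.com"})
httpx.delete(f"{BASE}/config/ignored-domains/*.example.com")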
app/page_reader.py

@@ -4,15 +4,11 @@ from bs4 import BeautifulSoup
 from typing import Optional
 from urllib.parse import urlparse
 from .config import ReaderConfig
-import logging
-from .database import SessionLocal, BlacklistedDomain
+from .logging_config import setup_logger
+from .database import SessionLocal
 
-# Setup logging with less verbose output
-logging.basicConfig(
-    level=logging.WARNING,
-    format='%(levelname)s: %(message)s'
-)
-logger = logging.getLogger(__name__)
+# Setup logger for this module
+logger = setup_logger(__name__)
 
 # Patterns for cleaning
 SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
@@ -26,13 +22,15 @@ SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
 class PageReader:
     def __init__(self):
         self.config = ReaderConfig()
-        self.db = SessionLocal()
+        logger.info("PageReader initialized")
 
     def clean_html(self, html: str) -> str:
         """Clean HTML by removing unwanted elements and patterns."""
         if not html:
+            logger.warning("Received empty HTML to clean")
             return ""
 
+        logger.debug(f"Cleaning HTML of length: {len(html)}")
         # First use regex to remove problematic patterns
         html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
         html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
@@ -54,12 +52,15 @@ class PageReader:
             ]
 
             for element in elements_to_remove:
+                removed = len(soup.find_all(element))
+                if removed:
+                    logger.debug(f"Removed {removed} {element} elements")
                 for tag in soup.find_all(element):
                     tag.decompose()
 
             return str(soup)
         except Exception as e:
-            logger.error(f"Error cleaning HTML: {e}")
+            logger.error(f"Error cleaning HTML: {e}", exc_info=True)
             return ""
 
     def clean_whitespace(self, text: str) -> str:
@@ -80,11 +81,17 @@ class PageReader:
     def html_to_markdown(self, html: str) -> Optional[str]:
         """Convert HTML to markdown."""
         try:
+            logger.info("Starting HTML to Markdown conversion")
+            logger.debug(f"Input HTML length: {len(html)}")
+
             cleaned_html = self.clean_html(html)
+            logger.debug(f"Cleaned HTML length: {len(cleaned_html)}")
+
             if not cleaned_html:
+                logger.warning("No cleaned HTML content")
                 return None
 
-            return self.clean_whitespace(md(cleaned_html,
+            markdown = self.clean_whitespace(md(cleaned_html,
                 heading_style="ATX",
                 bullets="-",
                 autolinks=True,
@@ -92,10 +99,19 @@ class PageReader:
                 escape_asterisks=True,
                 escape_underscores=True))
 
+            logger.debug(f"Generated markdown length: {len(markdown) if markdown else 0}")
+
+            if not markdown or markdown.isspace():
+                logger.warning("Markdown is empty or whitespace only")
+                return None
+
+            return markdown
+
         except Exception as e:
-            logger.error(f"Error converting to markdown: {e}")
+            logger.error("Error converting to markdown", exc_info=True)
             return None
 
-    def close(self):
+    async def close(self):
         """Cleanup resources"""
-        self.db.close()
+        logger.info("Closing PageReader")
+        pass  # No need to close DB connection anymore
app/scheduler.py

@@ -7,6 +7,9 @@ from .page_reader import PageReader
 from sqlalchemy import func
 from sqlalchemy.orm import Session
 import pytz
+from .config import Config
+from .database import get_db
+from urllib.parse import urlparse
 
 class HistoryScheduler:
     def __init__(self):
@@ -14,6 +17,7 @@ class HistoryScheduler:
         self.page_reader = PageReader()
         self.last_history_update = None
         self.content_update_interval = timedelta(hours=24)  # Update content daily
+        self.config = Config()
 
     def _normalize_datetime(self, dt: datetime) -> datetime:
         """Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't"""
@@ -28,81 +32,70 @@ class HistoryScheduler:
         return dt.astimezone(pytz.UTC)
 
     async def update_bookmarks(self):
-        bookmarks = self.browser_collector.fetch_bookmarks()
-
-        db = SessionLocal()
+        """Update bookmarks from browser"""
         try:
-            # First, get all existing URLs to avoid duplicates
-            existing_urls = {
-                url: (added_time, folder)
-                for url, added_time, folder in
-                db.query(Bookmark.url, Bookmark.added_time, Bookmark.folder).all()
-            }
-
-            new_entries = []
-            for added_time, url, title, folder in bookmarks:
-                # Normalize the datetime
-                added_time = self._normalize_datetime(added_time)
-
-                # Only add if URL doesn't exist or if it's in a different folder
-                if (url not in existing_urls or
-                    existing_urls[url][1] != folder):
-                    domain = self.browser_collector.get_domain(url)
-                    entry = Bookmark(
-                        url=url,
-                        title=title,
-                        added_time=added_time,
-                        folder=folder,
-                        domain=domain
-                    )
-                    new_entries.append(entry)
-
-            if new_entries:
-                db.bulk_save_objects(new_entries)
-                db.commit()
+            db = next(get_db())
+            bookmarks = self.browser_collector.fetch_bookmarks()
+
+            for added_time, url, title, folder in bookmarks:  # Unpack the tuple
+                # Extract domain and check if it should be ignored
+                domain = urlparse(url).netloc
+                if self.config.is_domain_ignored(domain):
+                    continue
+
+                # Normalize the datetime
+                added_time = self._normalize_datetime(added_time)
+
+                # Process the bookmark only if domain is not ignored
+                bookmark_entry = Bookmark(
+                    url=url,
+                    title=title,
+                    added_time=added_time,
+                    folder=folder,
+                    domain=domain
+                )
+                db.add(bookmark_entry)
+
+            db.commit()
+        except Exception as e:
+            print(f"Error updating bookmarks: {e}")
         finally:
             db.close()
 
     async def update_history(self):
+        """Background task to update history periodically"""
         while True:
-            db = SessionLocal()
             try:
-                # Get the latest timestamp from our database
-                latest_entry = db.query(func.max(HistoryEntry.visit_time)).scalar()
-                if latest_entry:
-                    latest_entry = self._normalize_datetime(latest_entry)
-
-                # Fetch new history
-                history = self.browser_collector.fetch_history()
-
-                # Filter to only get entries newer than our latest entry
-                new_entries = []
-                for visit_time, url, title in history:
-                    # Normalize the datetime
-                    visit_time = self._normalize_datetime(visit_time)
-
-                    if not latest_entry or visit_time > latest_entry:
-                        domain = self.browser_collector.get_domain(url)
-                        entry = HistoryEntry(
-                            url=url,
-                            title=title,
-                            visit_time=visit_time,
-                            domain=domain
-                        )
-                        new_entries.append(entry)
-
-                if new_entries:
-                    db.bulk_save_objects(new_entries)
-                    db.commit()
-
-                # Update bookmarks
-                await self.update_bookmarks()
+                db = next(get_db())
+                history_entries = self.browser_collector.fetch_history()
+
+                for visit_time, url, title in history_entries:  # Unpack the tuple
+                    # Extract domain and check if it should be ignored
+                    domain = urlparse(url).netloc
+                    if self.config.is_domain_ignored(domain):
+                        continue
+
+                    # Normalize the datetime
+                    visit_time = self._normalize_datetime(visit_time)
+
+                    # Process the entry only if domain is not ignored
+                    history_entry = HistoryEntry(
+                        url=url,
+                        title=title,
+                        visit_time=visit_time,
+                        domain=domain
+                    )
+                    db.add(history_entry)
+
+                db.commit()
+            except Exception as e:
+                print(f"Error updating history: {e}")
             finally:
                 db.close()
 
-            # Wait for 5 minutes before next update
-            await asyncio.sleep(300)
+            await asyncio.sleep(300)  # Wait 5 minutes before next update
 
     async def close(self):
         """Cleanup resources"""
Extension background script

@@ -1,5 +1,82 @@
 console.log("Background script loaded");
 
+class WebSocketClient {
+    constructor() {
+        console.log("WebSocketClient constructor called");
+        this.messageQueue = [];
+        this.connect();
+        this.reconnectAttempts = 0;
+        this.maxReconnectAttempts = 5;
+    }
+
+    connect() {
+        console.log('Attempting to connect to WebSocket server...');
+        try {
+            this.ws = new WebSocket('ws://localhost:8523/ws');
+            console.log('WebSocket instance created');
+
+            this.ws.addEventListener('open', () => {
+                console.log('WebSocket connection opened successfully');
+                this.reconnectAttempts = 0;
+                this.processQueue();
+            });
+
+            this.ws.addEventListener('error', (event) => {
+                console.error('WebSocket error occurred:', event);
+            });
+
+            this.ws.addEventListener('close', (event) => {
+                console.log('WebSocket connection closed:', event.code, event.reason);
+                this.tryReconnect();
+            });
+
+            this.ws.addEventListener('message', (event) => {
+                console.log('Received message from server:', event.data);
+            });
+        } catch (error) {
+            console.error('Error creating WebSocket:', error);
+        }
+    }
+
+    processQueue() {
+        console.log(`Processing message queue (${this.messageQueue.length} messages)`);
+        while (this.messageQueue.length > 0) {
+            const data = this.messageQueue.shift();
+            this.sendMessage(data);
+        }
+    }
+
+    tryReconnect() {
+        if (this.reconnectAttempts < this.maxReconnectAttempts) {
+            this.reconnectAttempts++;
+            console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
+            setTimeout(() => this.connect(), 2000 * this.reconnectAttempts);
+        } else {
+            console.log('Max reconnection attempts reached');
+        }
+    }
+
+    sendMessage(data) {
+        if (this.ws.readyState === WebSocket.OPEN) {
+            try {
+                console.log('Sending data for URL:', data.url);
+                this.ws.send(JSON.stringify(data));
+                console.log('Data sent successfully');
+                return true;
+            } catch (error) {
+                console.error('Error sending data:', error);
+                return false;
+            }
+        } else {
+            console.log('WebSocket not ready, queueing message');
+            this.messageQueue.push(data);
+            return true;
+        }
+    }
+}
+
+const wsClient = new WebSocketClient();
+
 async function isContentScriptReady(tabId) {
     try {
         await browser.tabs.sendMessage(tabId, { type: "PING" });
@@ -38,9 +115,17 @@ async function sendMessageToTab(tabId) {
     }
 }
 
+// Listen for messages from content scripts
+browser.runtime.onMessage.addListener((message, sender) => {
+    if (message.type === "SEND_PAGE_CONTENT") {
+        console.log('Received page content from tab:', sender.tab.id);
+        wsClient.sendMessage(message.data);
+    }
+});
+
 browser.webNavigation.onCompleted.addListener(async (details) => {
     console.log("Navigation completed", details);
-    if (details.frameId === 0) { // Only handle main frame navigation
+    if (details.frameId === 0) {
         console.log(`Main frame navigation detected for tab ${details.tabId}`);
         await sendMessageToTab(details.tabId);
     }
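For reference, the protocol the background script speaks over the single WebSocket is plain JSON with url, html and timestamp keys, answered by a status/message object from the /ws endpoint. A minimal Python test client under that assumption (the websockets package is already pinned in requirements.txt; the page values here are made up):

import asyncio
import json
from datetime import datetime, timezone

import websockets  # pinned in requirements.txt as websockets==11.0.3

async def send_test_page():
    # Same message shape the extension's WebSocketClient sends
    payload = {
        "url": "https://example.com/",
        "html": "<html><head><title>Example</title></head><body><p>Hello</p></body></html>",
        "timestamp": datetime.now(timezone.utc).isoformat(),
    }
    async with websockets.connect("ws://localhost:8523/ws") as ws:
        await ws.send(json.dumps(payload))
        reply = json.loads(await ws.recv())
        print(reply)  # expected: {"status": "success"/"ignored"/"skipped"/"error", "message": ...}

asyncio.run(send_test_page())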
Extension content script

@@ -1,132 +1,32 @@
 console.log("Content script starting initialization...");
 
-// Function to log WebSocket state
-function getWebSocketState(ws) {
-    const states = {
-        0: 'CONNECTING',
-        1: 'OPEN',
-        2: 'CLOSING',
-        3: 'CLOSED'
-    };
-    return states[ws.readyState] || 'UNKNOWN';
-}
-
-class WebSocketClient {
-    constructor() {
-        console.log("WebSocketClient constructor called");
-        this.messageQueue = [];
-        this.connect();
-        this.reconnectAttempts = 0;
-        this.maxReconnectAttempts = 5;
-    }
-
-    connect() {
-        console.log('Attempting to connect to WebSocket server...');
-        try {
-            this.ws = new WebSocket('ws://localhost:8523/ws');
-            console.log('WebSocket instance created');
-
-            this.ws.addEventListener('open', () => {
-                console.log('WebSocket connection opened successfully');
-                this.reconnectAttempts = 0;
-                // Process any queued messages
-                this.processQueue();
-            });
-
-            this.ws.addEventListener('error', (event) => {
-                console.error('WebSocket error occurred:', event);
-            });
-
-            this.ws.addEventListener('close', (event) => {
-                console.log('WebSocket connection closed:', event.code, event.reason);
-                this.tryReconnect();
-            });
-
-            this.ws.addEventListener('message', (event) => {
-                console.log('Received message from server:', event.data);
-            });
-        } catch (error) {
-            console.error('Error creating WebSocket:', error);
-        }
-    }
-
-    processQueue() {
-        console.log(`Processing message queue (${this.messageQueue.length} messages)`);
-        while (this.messageQueue.length > 0) {
-            const data = this.messageQueue.shift();
-            this.sendMessage(data);
-        }
-    }
-
-    tryReconnect() {
-        if (this.reconnectAttempts < this.maxReconnectAttempts) {
-            this.reconnectAttempts++;
-            console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`);
-            setTimeout(() => this.connect(), 2000 * this.reconnectAttempts);
-        } else {
-            console.log('Max reconnection attempts reached');
-        }
-    }
-
-    sendMessage(data) {
-        console.log('sendMessage called, WebSocket state:', getWebSocketState(this.ws));
-        if (this.ws.readyState === WebSocket.OPEN) {
-            try {
-                console.log('Preparing to send data:', {
-                    url: data.url,
-                    timestamp: data.timestamp,
-                    htmlLength: data.html.length
-                });
-                this.ws.send(JSON.stringify(data));
-                console.log('Data sent successfully');
-                return true;
-            } catch (error) {
-                console.error('Error sending data:', error);
-                return false;
-            }
-        } else {
-            console.log('WebSocket not ready, queueing message');
-            this.messageQueue.push(data);
-            return true;
-        }
-    }
-}
-
-console.log("Creating WebSocketClient instance...");
-const wsClient = new WebSocketClient();
-
-console.log("Setting up message listener...");
+function sendPageContent() {
+    const pageContent = {
+        url: window.location.href,
+        html: document.documentElement.outerHTML,
+        timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
+    };
+
+    browser.runtime.sendMessage({
+        type: "SEND_PAGE_CONTENT",
+        data: pageContent
+    });
+}
+
+// Listen for messages from the background script
 browser.runtime.onMessage.addListener((message, sender, sendResponse) => {
-    console.log('Message received from background script:', message);
-
     if (message.type === "PING") {
-        console.log('Received PING, responding...');
         return Promise.resolve({ status: "ready" });
     }
 
     if (message.type === "GET_PAGE_CONTENT") {
-        console.log('Processing GET_PAGE_CONTENT message');
-        const pageContent = {
-            url: window.location.href,
-            html: document.documentElement.outerHTML,
-            timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
-        };
-
-        console.log('Created page content object for:', pageContent.url);
-        wsClient.sendMessage(pageContent);
+        sendPageContent();
     }
 
     return true;
 });
 
 // Send initial page content
-console.log('Sending initial page content...');
-const pageContent = {
-    url: window.location.href,
-    html: document.documentElement.outerHTML,
-    timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z')
-};
-
-wsClient.sendMessage(pageContent);
+sendPageContent();
 
 console.log("Content script initialization complete for:", window.location.href);
Deleted file (84 lines; standalone HTML-to-Markdown helper)

@@ -1,84 +0,0 @@
-import httpx
-import re
-from markdownify import markdownify as md
-from bs4 import BeautifulSoup
-
-# Patterns for cleaning
-SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>"
-STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>"
-META_PATTERN = r"<[ ]*meta.*?>"
-COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>"
-LINK_PATTERN = r"<[ ]*link.*?>"
-BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
-SVG_PATTERN = r"(<svg[^>]*>)(.*?)(<\/svg>)"
-
-def clean_html(html: str) -> str:
-    """Clean HTML by removing unwanted elements and patterns."""
-    # First use regex to remove problematic patterns
-    html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL)
-    html = re.sub(BASE64_IMG_PATTERN, "", html)
-
-    # Use BeautifulSoup to remove additional elements we want to strip
-    soup = BeautifulSoup(html, 'html.parser')
-
-    # Remove unwanted elements
-    elements_to_remove = [
-        'canvas', 'img', 'picture', 'audio', 'video',
-        'iframe', 'embed', 'object', 'param', 'track',
-        'map', 'area', 'source'
-    ]
-
-    for element in elements_to_remove:
-        for tag in soup.find_all(element):
-            tag.decompose()
-
-    return str(soup)
-
-def get_page_html(url: str) -> str:
-    """Fetch HTML content from a given URL using httpx."""
-    headers = {
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
-    }
-    try:
-        with httpx.Client(follow_redirects=True) as client:
-            response = client.get(url, headers=headers)
-            response.raise_for_status()
-            return response.text
-    except httpx.HTTPError as e:
-        print(f"Error fetching page: {e}")
-        return ""
-
-def clean_whitespace(text: str) -> str:
-    """Clean excessive whitespace from text, collapsing more than 2 newlines."""
-    # Replace 3 or more newlines with 2 newlines
-    cleaned = re.sub(r'\n{3,}', '\n\n', text)
-    # Remove trailing whitespace from each line
-    cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines())
-    return cleaned.strip()
-
-def html_to_markdown(url: str) -> str:
-    """Convert webpage HTML to markdown."""
-    html = get_page_html(url)
-    if not html:
-        return ""
-
-    # Clean the HTML first
-    cleaned_html = clean_html(html)
-
-    # Convert to markdown using markdownify
-    # Configure markdownify options for clean output
-    markdown = md(cleaned_html,
-        heading_style="ATX",      # Use # style headers
-        bullets="-",              # Use - for bullets
-        autolinks=True,           # Convert URLs to links
-        strip=['form'],           # Additional elements to strip
-        escape_asterisks=True,
-        escape_underscores=True)
-
-    # Clean up excessive whitespace
-    return clean_whitespace(markdown)
requirements.txt

@@ -2,9 +2,10 @@ fastapi
 uvicorn
 sqlalchemy
 browser-history
-beautifulsoup4
+beautifulsoup4>=4.9.3
 markdownify
-pyyaml
+pyyaml>=6.0.1
 pytz
 websockets==11.0.3
 iso8601==2.1.0
+lxml>=4.9.3