diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..d4c3740 --- /dev/null +++ b/.env.example @@ -0,0 +1,6 @@ +# Meilisearch Configuration +MEILISEARCH_HOST=http://localhost:7700 +# Generate a master key using: openssl rand -hex 32 +MEILISEARCH_MASTER_KEY=your_master_key_here + +# Example master key: 6d99b335033595ea62d02a5641b94e04e80c33c1e1f1f789c84445ff5 \ No newline at end of file diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e188c78 --- /dev/null +++ b/app/__init__.py @@ -0,0 +1 @@ +# This file can be empty, it just marks the directory as a Python package \ No newline at end of file diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..31ba229 --- /dev/null +++ b/app/config.py @@ -0,0 +1,92 @@ +import yaml +from pathlib import Path +from typing import Set +import fnmatch + +class ReaderConfig: + def __init__(self): + self.excluded_patterns: Set[str] = set() + self._load_config() + + def _load_config(self): + config_path = Path("config/reader_config.yaml") + if not config_path.exists(): + print("Warning: reader_config.yaml not found, creating default config") + self._create_default_config(config_path) + + try: + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + self.excluded_patterns = set(config.get('excluded_domains', [])) + except Exception as e: + print(f"Error loading config: {e}") + self.excluded_patterns = set() + + def _create_default_config(self, config_path: Path): + config_path.parent.mkdir(parents=True, exist_ok=True) + default_config = { + 'excluded_domains': [ + 'localhost', + '127.0.0.1', + '192.168.*.*', + '10.*.*.*' + ] + } + with open(config_path, 'w') as f: + yaml.safe_dump(default_config, f, default_flow_style=False) + + def is_domain_excluded(self, domain: str) -> bool: + """ + Check if a domain matches any exclusion pattern. + Supports glob-style wildcards (* and ?) + Examples: + - '*.example.com' matches any subdomain of example.com + - 'reddit-*.com' matches reddit-video.com, reddit-static.com, etc. + - '192.168.*.*' matches any IP in the 192.168.0.0/16 subnet + """ + domain = domain.lower() + + # Check each pattern + for pattern in self.excluded_patterns: + pattern = pattern.lower() + + # Handle IP address patterns specially + if any(c.isdigit() for c in pattern): + if self._match_ip_pattern(domain, pattern): + return True + + # Handle domain patterns + if fnmatch.fnmatch(domain, pattern): + return True + # Also check if the pattern matches when prepended with a dot + # This handles cases like 'example.com' matching 'subdomain.example.com' + if fnmatch.fnmatch(domain, f"*.{pattern}"): + return True + + return False + + def _match_ip_pattern(self, domain: str, pattern: str) -> bool: + """ + Special handling for IP address patterns. 
+ Handles cases like '192.168.*.*' matching '192.168.1.1' + """ + # Skip if domain isn't IP-like + if not any(c.isdigit() for c in domain): + return False + + # Split into octets + domain_parts = domain.split('.') + pattern_parts = pattern.split('.') + + # Must have same number of parts + if len(domain_parts) != len(pattern_parts): + return False + + # Check each octet + for domain_part, pattern_part in zip(domain_parts, pattern_parts): + if pattern_part == '*': + continue + if domain_part != pattern_part: + return False + + return True \ No newline at end of file diff --git a/app/database.py b/app/database.py index 391fe80..c8edecf 100644 --- a/app/database.py +++ b/app/database.py @@ -1,6 +1,7 @@ -from sqlalchemy import create_engine, Column, Integer, String, DateTime +from sqlalchemy import create_engine, Column, Integer, String, DateTime, Text from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker +from datetime import datetime SQLALCHEMY_DATABASE_URL = "sqlite:///./browser_history.db" @@ -10,13 +11,15 @@ SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) Base = declarative_base() class HistoryEntry(Base): - __tablename__ = "history_entries" + __tablename__ = "history" - id = Column(Integer, primary_key=True, index=True) - url = Column(String, index=True) - title = Column(String, nullable=True) - visit_time = Column(DateTime, index=True) - domain = Column(String, index=True) + id = Column(Integer, primary_key=True) + url = Column(String) + title = Column(String) + visit_time = Column(DateTime) + domain = Column(String) + markdown_content = Column(Text, nullable=True) + last_content_update = Column(DateTime, nullable=True) class Bookmark(Base): __tablename__ = "bookmarks" @@ -28,6 +31,37 @@ class Bookmark(Base): folder = Column(String, index=True) domain = Column(String, index=True) +class BlacklistedDomain(Base): + __tablename__ = "blacklisted_domains" + + id = Column(Integer, primary_key=True) + domain = Column(String, unique=True, index=True) + reason = Column(String, nullable=True) + added_time = Column(DateTime, default=datetime.utcnow) + + @classmethod + def is_blacklisted(cls, db: SessionLocal, domain: str) -> bool: + """Check if a domain is blacklisted""" + return db.query(cls).filter(cls.domain == domain.lower()).first() is not None + + @classmethod + def add_to_blacklist(cls, db: SessionLocal, domain: str, reason: str = None): + """Add a domain to the blacklist""" + try: + blacklist_entry = cls( + domain=domain.lower(), + reason=reason + ) + db.add(blacklist_entry) + db.commit() + except: + db.rollback() + # If entry already exists, just update the reason + existing = db.query(cls).filter(cls.domain == domain.lower()).first() + if existing and reason: + existing.reason = reason + db.commit() + Base.metadata.create_all(bind=engine) def get_db(): diff --git a/app/main.py b/app/main.py index 13f6feb..dcf2866 100644 --- a/app/main.py +++ b/app/main.py @@ -1,15 +1,31 @@ -from fastapi import FastAPI, Depends, Query +from fastapi import FastAPI, Depends, Query, WebSocket, WebSocketDisconnect from sqlalchemy.orm import Session -from datetime import datetime -from typing import List +from datetime import datetime, timezone +from typing import List, Optional import asyncio +from fastapi import WebSocketDisconnect +from urllib.parse import urlparse +import pytz +from fastapi.middleware.cors import CORSMiddleware +import iso8601 from .database import get_db, HistoryEntry, Bookmark from .scheduler import HistoryScheduler 
+from .page_info import PageInfo +from .page_reader import PageReader app = FastAPI() scheduler = HistoryScheduler() +# Add CORS middleware to allow WebSocket connections +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], # In production, specify your domains + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + @app.on_event("startup") async def startup_event(): # Initial bookmark fetch @@ -17,12 +33,37 @@ async def startup_event(): # Start the background task asyncio.create_task(scheduler.update_history()) +def serialize_history_entry(entry, include_content: bool = False): + """Serialize a HistoryEntry object to a dictionary""" + result = { + "id": entry.id, + "url": entry.url, + "title": entry.title, + "visit_time": entry.visit_time.isoformat() if entry.visit_time else None, + "domain": entry.domain, + } + if include_content: + result["markdown_content"] = entry.markdown_content + return result + +def serialize_bookmark(bookmark): + """Serialize a Bookmark object to a dictionary""" + return { + "id": bookmark.id, + "url": bookmark.url, + "title": bookmark.title, + "added_time": bookmark.added_time.isoformat() if bookmark.added_time else None, + "folder": bookmark.folder, + "domain": bookmark.domain, + } + @app.get("/history/search") async def search_history( - domain: str = Query(None), - start_date: datetime = Query(None), - end_date: datetime = Query(None), - search_term: str = Query(None), + domain: Optional[str] = Query(None), + start_date: Optional[datetime] = Query(None), + end_date: Optional[datetime] = Query(None), + search_term: Optional[str] = Query(None), + include_content: bool = Query(False), db: Session = Depends(get_db) ): query = db.query(HistoryEntry) @@ -37,15 +78,19 @@ async def search_history( query = query.filter(HistoryEntry.visit_time <= end_date) if search_term: - query = query.filter(HistoryEntry.title.ilike(f"%{search_term}%")) + query = query.filter( + (HistoryEntry.title.ilike(f"%{search_term}%")) | + (HistoryEntry.markdown_content.ilike(f"%{search_term}%")) + ) - return query.all() + entries = query.all() + return [serialize_history_entry(entry, include_content) for entry in entries] @app.get("/bookmarks/search") async def search_bookmarks( - domain: str = Query(None), - folder: str = Query(None), - search_term: str = Query(None), + domain: Optional[str] = Query(None), + folder: Optional[str] = Query(None), + search_term: Optional[str] = Query(None), db: Session = Depends(get_db) ): query = db.query(Bookmark) @@ -59,4 +104,73 @@ async def search_bookmarks( if search_term: query = query.filter(Bookmark.title.ilike(f"%{search_term}%")) - return query.all() \ No newline at end of file + bookmarks = query.all() + return [serialize_bookmark(bookmark) for bookmark in bookmarks] + +@app.websocket("/ws") +async def websocket_endpoint(websocket: WebSocket, db: Session = Depends(get_db)): + print("WebSocket endpoint called") + page_reader = PageReader() + print("New WebSocket connection established") + await websocket.accept() + print("WebSocket connection accepted") + try: + while True: + print("Waiting for message...") + data = await websocket.receive_json() + print(f"Received message for URL: {data['url']}") + print(f"HTML content length: {len(data['html'])}") + print(f"Timestamp: {data['timestamp']}") + + # Parse the ISO timestamp correctly + timestamp = iso8601.parse_date(data['timestamp']) + + page_info = PageInfo( + url=data['url'], + html=data['html'], + timestamp=timestamp + ) + print(f"Created PageInfo object for: 
{page_info.url}") + + # Convert HTML to markdown + print("Converting HTML to markdown...") + markdown_content = page_reader.html_to_markdown(page_info.html) + print(f"Markdown conversion complete, length: {len(markdown_content) if markdown_content else 0}") + + # Update or create history entry + domain = urlparse(page_info.url).netloc + print(f"Creating history entry for domain: {domain}") + history_entry = HistoryEntry( + url=page_info.url, + visit_time=page_info.timestamp, + domain=domain, + markdown_content=markdown_content, + last_content_update=datetime.now(timezone.utc) + ) + + print("Saving to database...") + db.add(history_entry) + db.commit() + print("Database save complete") + + # Send confirmation back to client + await websocket.send_json({ + "status": "success", + "message": f"Processed page: {page_info.url}" + }) + + except WebSocketDisconnect: + print("Client disconnected") + except Exception as e: + print(f"Error handling message: {e}") + # Send error back to client if possible + try: + await websocket.send_json({ + "status": "error", + "message": str(e) + }) + except: + pass + finally: + print("Cleaning up resources") + page_reader.close() \ No newline at end of file diff --git a/app/page_info.py b/app/page_info.py index 404104f..5e5ae4b 100644 --- a/app/page_info.py +++ b/app/page_info.py @@ -1,16 +1,8 @@ -import asyncio -import aiohttp -from bs4 import BeautifulSoup -from typing import Optional +from dataclasses import dataclass +from datetime import datetime -class PageInfoFetcher: - async def get_page_title(self, url: str) -> Optional[str]: - try: - async with aiohttp.ClientSession() as session: - async with session.get(url, timeout=5) as response: - if response.status == 200: - html = await response.text() - soup = BeautifulSoup(html, 'html.parser') - return soup.title.string if soup.title else None - except: - return None \ No newline at end of file +@dataclass +class PageInfo: + url: str + html: str + timestamp: datetime \ No newline at end of file diff --git a/app/page_reader.py b/app/page_reader.py new file mode 100644 index 0000000..e2d8175 --- /dev/null +++ b/app/page_reader.py @@ -0,0 +1,101 @@ +import re +from markdownify import markdownify as md +from bs4 import BeautifulSoup +from typing import Optional +from urllib.parse import urlparse +from .config import ReaderConfig +import logging +from .database import SessionLocal, BlacklistedDomain + +# Setup logging with less verbose output +logging.basicConfig( + level=logging.WARNING, + format='%(levelname)s: %(message)s' +) +logger = logging.getLogger(__name__) + +# Patterns for cleaning +SCRIPT_PATTERN = r"<[ ]*script.*?\/[ ]*script[ ]*>" +STYLE_PATTERN = r"<[ ]*style.*?\/[ ]*style[ ]*>" +META_PATTERN = r"<[ ]*meta.*?>" +COMMENT_PATTERN = r"<[ ]*!--.*?--[ ]*>" +LINK_PATTERN = r"<[ ]*link.*?>" +BASE64_IMG_PATTERN = r']+src="data:image/[^;]+;base64,[^"]+"[^>]*>' +SVG_PATTERN = r"(]*>)(.*?)(<\/svg>)" + +class PageReader: + def __init__(self): + self.config = ReaderConfig() + self.db = SessionLocal() + + def clean_html(self, html: str) -> str: + """Clean HTML by removing unwanted elements and patterns.""" + if not html: + return "" + + # First use regex to remove problematic patterns + html = re.sub(SCRIPT_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) + html = re.sub(STYLE_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) + html = re.sub(META_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) + html = re.sub(COMMENT_PATTERN, "", html, flags=re.IGNORECASE | 
re.MULTILINE | re.DOTALL) + html = re.sub(LINK_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) + html = re.sub(SVG_PATTERN, "", html, flags=re.IGNORECASE | re.MULTILINE | re.DOTALL) + html = re.sub(BASE64_IMG_PATTERN, "", html) + + try: + # Use BeautifulSoup to remove additional elements we want to strip + soup = BeautifulSoup(html, 'html.parser') + + # Remove unwanted elements + elements_to_remove = [ + 'canvas', 'img', 'picture', 'audio', 'video', + 'iframe', 'embed', 'object', 'param', 'track', + 'map', 'area', 'source' + ] + + for element in elements_to_remove: + for tag in soup.find_all(element): + tag.decompose() + + return str(soup) + except Exception as e: + logger.error(f"Error cleaning HTML: {e}") + return "" + + def clean_whitespace(self, text: str) -> str: + """Clean excessive whitespace from text.""" + if not text: + return "" + + try: + # Replace 3 or more newlines with 2 newlines + cleaned = re.sub(r'\n{3,}', '\n\n', text) + # Remove trailing whitespace from each line + cleaned = '\n'.join(line.rstrip() for line in cleaned.splitlines()) + return cleaned.strip() + except Exception as e: + logger.error(f"Error cleaning whitespace: {e}") + return "" + + def html_to_markdown(self, html: str) -> Optional[str]: + """Convert HTML to markdown.""" + try: + cleaned_html = self.clean_html(html) + if not cleaned_html: + return None + + return self.clean_whitespace(md(cleaned_html, + heading_style="ATX", + bullets="-", + autolinks=True, + strip=['form'], + escape_asterisks=True, + escape_underscores=True)) + + except Exception as e: + logger.error(f"Error converting to markdown: {e}") + return None + + def close(self): + """Cleanup resources""" + self.db.close() \ No newline at end of file diff --git a/app/scheduler.py b/app/scheduler.py index e966dca..d80d79c 100644 --- a/app/scheduler.py +++ b/app/scheduler.py @@ -3,14 +3,29 @@ from datetime import datetime, timedelta import asyncio from .database import SessionLocal, HistoryEntry, Bookmark from .browser import BrowserHistoryCollector -from .page_info import PageInfoFetcher +from .page_reader import PageReader from sqlalchemy import func +from sqlalchemy.orm import Session +import pytz class HistoryScheduler: def __init__(self): self.browser_collector = BrowserHistoryCollector() - self.page_fetcher = PageInfoFetcher() + self.page_reader = PageReader() self.last_history_update = None + self.content_update_interval = timedelta(hours=24) # Update content daily + + def _normalize_datetime(self, dt: datetime) -> datetime: + """Convert datetime to UTC if it has timezone, or make it timezone-aware if it doesn't""" + if dt is None: + return None + + # If datetime is naive (no timezone), assume it's in UTC + if dt.tzinfo is None: + return pytz.UTC.localize(dt) + + # If datetime has timezone, convert to UTC + return dt.astimezone(pytz.UTC) async def update_bookmarks(self): bookmarks = self.browser_collector.fetch_bookmarks() @@ -26,6 +41,9 @@ class HistoryScheduler: new_entries = [] for added_time, url, title, folder in bookmarks: + # Normalize the datetime + added_time = self._normalize_datetime(added_time) + # Only add if URL doesn't exist or if it's in a different folder if (url not in existing_urls or existing_urls[url][1] != folder): @@ -51,6 +69,8 @@ class HistoryScheduler: try: # Get the latest timestamp from our database latest_entry = db.query(func.max(HistoryEntry.visit_time)).scalar() + if latest_entry: + latest_entry = self._normalize_datetime(latest_entry) # Fetch new history history = 
self.browser_collector.fetch_history() @@ -58,11 +78,11 @@ class HistoryScheduler: # Filter to only get entries newer than our latest entry new_entries = [] for visit_time, url, title in history: + # Normalize the datetime + visit_time = self._normalize_datetime(visit_time) + if not latest_entry or visit_time > latest_entry: domain = self.browser_collector.get_domain(url) - if not title: - title = await self.page_fetcher.get_page_title(url) - entry = HistoryEntry( url=url, title=title, @@ -82,4 +102,8 @@ class HistoryScheduler: db.close() # Wait for 5 minutes before next update - await asyncio.sleep(300) \ No newline at end of file + await asyncio.sleep(300) + + async def close(self): + """Cleanup resources""" + self.page_reader.close() \ No newline at end of file diff --git a/app/websocket_server.py b/app/websocket_server.py new file mode 100644 index 0000000..cf853ef --- /dev/null +++ b/app/websocket_server.py @@ -0,0 +1,33 @@ +import asyncio +import websockets +import json +from page_info import PageInfo +from datetime import datetime + +async def handle_websocket(websocket, path): + try: + async for message in websocket: + data = json.loads(message) + page_info = PageInfo( + url=data['url'], + html=data['html'], + timestamp=datetime.fromisoformat(data['timestamp']) + ) + print(f"Received page content from: {page_info.url}") + # Here you can process the page_info object as needed + + except websockets.exceptions.ConnectionClosed: + print("Client disconnected") + except Exception as e: + print(f"Error handling message: {e}") + +async def start_server(): + server = await websockets.serve(handle_websocket, "localhost", 8765) + print("WebSocket server started on ws://localhost:8765") + await server.wait_closed() + +def run_server(): + asyncio.run(start_server()) + +if __name__ == "__main__": + run_server() \ No newline at end of file diff --git a/config/reader_config.yaml b/config/reader_config.yaml new file mode 100644 index 0000000..f764257 --- /dev/null +++ b/config/reader_config.yaml @@ -0,0 +1,15 @@ +# Domains to exclude from content reading +excluded_domains: + # Local sites + - localhost + - 127.0.0.1 + + # IP ranges + - 192.168.*.* + - 10.*.*.* + - 172.16.*.* + + # Example wildcard patterns + # - *.local + # - reddit-*.com + # - *.githubusercontent.com diff --git a/extension/background.js b/extension/background.js new file mode 100644 index 0000000..c9cdb23 --- /dev/null +++ b/extension/background.js @@ -0,0 +1,47 @@ +console.log("Background script loaded"); + +async function isContentScriptReady(tabId) { + try { + await browser.tabs.sendMessage(tabId, { type: "PING" }); + return true; + } catch (error) { + return false; + } +} + +async function waitForContentScript(tabId, maxAttempts = 10) { + console.log(`Waiting for content script in tab ${tabId}`); + for (let i = 0; i < maxAttempts; i++) { + if (await isContentScriptReady(tabId)) { + console.log(`Content script ready in tab ${tabId}`); + return true; + } + console.log(`Attempt ${i + 1}: Content script not ready, waiting...`); + await new Promise(resolve => setTimeout(resolve, 500)); + } + console.log(`Content script not ready after ${maxAttempts} attempts`); + return false; +} + +async function sendMessageToTab(tabId) { + try { + console.log(`Checking content script status for tab ${tabId}`); + if (await waitForContentScript(tabId)) { + console.log(`Sending GET_PAGE_CONTENT message to tab ${tabId}`); + await browser.tabs.sendMessage(tabId, { + type: "GET_PAGE_CONTENT" + }); + console.log(`Successfully sent message to tab 
${tabId}`); + } + } catch (error) { + console.error(`Error sending message to tab ${tabId}:`, error); + } +} + +browser.webNavigation.onCompleted.addListener(async (details) => { + console.log("Navigation completed", details); + if (details.frameId === 0) { // Only handle main frame navigation + console.log(`Main frame navigation detected for tab ${details.tabId}`); + await sendMessageToTab(details.tabId); + } +}); \ No newline at end of file diff --git a/extension/content.js b/extension/content.js new file mode 100644 index 0000000..f669d75 --- /dev/null +++ b/extension/content.js @@ -0,0 +1,132 @@ +console.log("Content script starting initialization..."); + +// Function to log WebSocket state +function getWebSocketState(ws) { + const states = { + 0: 'CONNECTING', + 1: 'OPEN', + 2: 'CLOSING', + 3: 'CLOSED' + }; + return states[ws.readyState] || 'UNKNOWN'; +} + +class WebSocketClient { + constructor() { + console.log("WebSocketClient constructor called"); + this.messageQueue = []; + this.connect(); + this.reconnectAttempts = 0; + this.maxReconnectAttempts = 5; + } + + connect() { + console.log('Attempting to connect to WebSocket server...'); + try { + this.ws = new WebSocket('ws://localhost:8523/ws'); + console.log('WebSocket instance created'); + + this.ws.addEventListener('open', () => { + console.log('WebSocket connection opened successfully'); + this.reconnectAttempts = 0; + // Process any queued messages + this.processQueue(); + }); + + this.ws.addEventListener('error', (event) => { + console.error('WebSocket error occurred:', event); + }); + + this.ws.addEventListener('close', (event) => { + console.log('WebSocket connection closed:', event.code, event.reason); + this.tryReconnect(); + }); + + this.ws.addEventListener('message', (event) => { + console.log('Received message from server:', event.data); + }); + } catch (error) { + console.error('Error creating WebSocket:', error); + } + } + + processQueue() { + console.log(`Processing message queue (${this.messageQueue.length} messages)`); + while (this.messageQueue.length > 0) { + const data = this.messageQueue.shift(); + this.sendMessage(data); + } + } + + tryReconnect() { + if (this.reconnectAttempts < this.maxReconnectAttempts) { + this.reconnectAttempts++; + console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`); + setTimeout(() => this.connect(), 2000 * this.reconnectAttempts); + } else { + console.log('Max reconnection attempts reached'); + } + } + + sendMessage(data) { + console.log('sendMessage called, WebSocket state:', getWebSocketState(this.ws)); + if (this.ws.readyState === WebSocket.OPEN) { + try { + console.log('Preparing to send data:', { + url: data.url, + timestamp: data.timestamp, + htmlLength: data.html.length + }); + this.ws.send(JSON.stringify(data)); + console.log('Data sent successfully'); + return true; + } catch (error) { + console.error('Error sending data:', error); + return false; + } + } else { + console.log('WebSocket not ready, queueing message'); + this.messageQueue.push(data); + return true; + } + } +} + +console.log("Creating WebSocketClient instance..."); +const wsClient = new WebSocketClient(); + +console.log("Setting up message listener..."); +browser.runtime.onMessage.addListener((message, sender, sendResponse) => { + console.log('Message received from background script:', message); + + if (message.type === "PING") { + console.log('Received PING, responding...'); + return Promise.resolve({ status: "ready" }); + } + + if (message.type === 
"GET_PAGE_CONTENT") { + console.log('Processing GET_PAGE_CONTENT message'); + const pageContent = { + url: window.location.href, + html: document.documentElement.outerHTML, + timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z') + }; + + console.log('Created page content object for:', pageContent.url); + wsClient.sendMessage(pageContent); + } + + return true; +}); + +// Send initial page content +console.log('Sending initial page content...'); +const pageContent = { + url: window.location.href, + html: document.documentElement.outerHTML, + timestamp: new Date().toISOString().replace(/\.\d{3}Z$/, 'Z') +}; + +wsClient.sendMessage(pageContent); + +console.log("Content script initialization complete for:", window.location.href); \ No newline at end of file diff --git a/extension/manifest.json b/extension/manifest.json new file mode 100644 index 0000000..7450313 --- /dev/null +++ b/extension/manifest.json @@ -0,0 +1,35 @@ +{ + "manifest_version": 2, + "name": "Page Content Sender", + "version": "1.0", + "description": "Sends page content via WebSocket when a page loads", + "permissions": [ + "webNavigation", + "activeTab", + "", + "tabs" + ], + "background": { + "scripts": [ + "background.js" + ], + "persistent": true + }, + "content_scripts": [ + { + "matches": [ + "" + ], + "js": [ + "content.js" + ], + "run_at": "document_idle", + "all_frames": false + } + ], + "browser_specific_settings": { + "gecko": { + "id": "page-content-sender@example.com" + } + } +} \ No newline at end of file diff --git a/page-reader.py b/page-reader.py index 48f123e..ae83191 100644 --- a/page-reader.py +++ b/page-reader.py @@ -82,9 +82,3 @@ def html_to_markdown(url: str) -> str: # Clean up excessive whitespace return clean_whitespace(markdown) - -if __name__ == "__main__": - # Example usage - url = "https://reddit.com" - markdown_content = html_to_markdown(url) - print(markdown_content) diff --git a/requirements.txt b/requirements.txt index 22e864e..eb88162 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,10 @@ -fastapi==0.109.2 -uvicorn==0.27.1 -sqlalchemy==2.0.27 -browser-history==0.4.1 -aiohttp==3.9.3 -beautifulsoup4==4.12.3 -httpx==0.27.0 -markdownify==0.11.6 \ No newline at end of file +fastapi +uvicorn +sqlalchemy +browser-history +beautifulsoup4 +markdownify +pyyaml +pytz +websockets==11.0.3 +iso8601==2.1.0 \ No newline at end of file diff --git a/terminal b/terminal new file mode 100644 index 0000000..613357b --- /dev/null +++ b/terminal @@ -0,0 +1 @@ +rm app/websocket_server.py \ No newline at end of file