From bfc7f4dd3539bfc12ecf5b020896afea5cf14619 Mon Sep 17 00:00:00 2001 From: Zetaphor Date: Sat, 12 Apr 2025 21:13:21 -0500 Subject: [PATCH] Historical records use original timestamp --- src/base_crawler.py | 8 +++++--- src/database.py | 5 +++-- src/main.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/base_crawler.py b/src/base_crawler.py index b953ac7..8d944f7 100644 --- a/src/base_crawler.py +++ b/src/base_crawler.py @@ -1,9 +1,10 @@ -from typing import Tuple +from typing import Tuple, Optional from urllib.parse import urlparse from database import Database from domain_exclusions import DomainExclusions from logger import Logger from crawl4ai import AsyncWebCrawler +from datetime import datetime class BaseCrawler: def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger): @@ -34,7 +35,7 @@ class BaseCrawler: return False, "" - async def crawl_url(self, url: str, default_title: str = None) -> Tuple[bool, dict]: + async def crawl_url(self, url: str, default_title: str = None, created_timestamp: Optional[datetime] = None) -> Tuple[bool, dict]: try: result = await self.crawler.arun(url=url) crawl_result = result[0] @@ -44,7 +45,8 @@ class BaseCrawler: self.db.add_history( url=url, title=title, - content=content + content=content, + created_timestamp=created_timestamp ) return True, { diff --git a/src/database.py b/src/database.py index ee94f7d..d86a498 100644 --- a/src/database.py +++ b/src/database.py @@ -59,14 +59,15 @@ class Database: self.conn.commit() - def add_history(self, url: str, title: str, content: str) -> int: + def add_history(self, url: str, title: str, content: str, created_timestamp: Optional[datetime] = None) -> int: """Add a new history entry.""" now = datetime.utcnow() + created_time = created_timestamp if created_timestamp else now with self._lock: self.cursor.execute(''' INSERT INTO history (url, title, content, created, updated) VALUES (?, ?, ?, ?, ?) - ''', (url, title, content, now, now)) + ''', (url, title, content, created_time, now)) self.conn.commit() return self.cursor.lastrowid diff --git a/src/main.py b/src/main.py index 77f46f7..f589d77 100644 --- a/src/main.py +++ b/src/main.py @@ -63,7 +63,7 @@ class HistoryCrawler(BaseCrawler): self.logger.info(f"Skipping URL from history: {url} ({skip_reason})") continue - success, result = await self.crawl_url(url, title) + success, result = await self.crawl_url(url, title, created_timestamp=timestamp) if success: self.logger.info(f"Processed historical URL: {url}")