mirror of https://github.com/Zetaphor/browser-recall.git
Historical records use original timestamp

@@ -1,9 +1,10 @@
-from typing import Tuple
+from typing import Tuple, Optional
 from urllib.parse import urlparse
 from database import Database
 from domain_exclusions import DomainExclusions
 from logger import Logger
 from crawl4ai import AsyncWebCrawler
+from datetime import datetime
 
 class BaseCrawler:
     def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger):
@@ -34,7 +35,7 @@ class BaseCrawler:
 
         return False, ""
 
-    async def crawl_url(self, url: str, default_title: str = None) -> Tuple[bool, dict]:
+    async def crawl_url(self, url: str, default_title: str = None, created_timestamp: Optional[datetime] = None) -> Tuple[bool, dict]:
         try:
             result = await self.crawler.arun(url=url)
             crawl_result = result[0]
@@ -44,7 +45,8 @@ class BaseCrawler:
             self.db.add_history(
                 url=url,
                 title=title,
-                content=content
+                content=content,
+                created_timestamp=created_timestamp
             )
 
             return True, {
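
Note on the BaseCrawler change above: created_timestamp is a new optional keyword, so existing callers that omit it keep the old behaviour (the database falls back to the current time). A minimal usage sketch, assuming a BaseCrawler instance named crawler and a visit time taken from a history record; the function and variable names here are illustrative, not from this diff:

    from datetime import datetime

    async def crawl_with_original_time(crawler, url: str, title: str, visited_at: datetime):
        # Forward the original visit time so Database.add_history stores it in
        # "created"; leave created_timestamp unset for live crawls.
        success, info = await crawler.crawl_url(url, default_title=title,
                                                created_timestamp=visited_at)
        return success, info
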
@@ -59,14 +59,15 @@ class Database:
 
         self.conn.commit()
 
-    def add_history(self, url: str, title: str, content: str) -> int:
+    def add_history(self, url: str, title: str, content: str, created_timestamp: Optional[datetime] = None) -> int:
         """Add a new history entry."""
         now = datetime.utcnow()
+        created_time = created_timestamp if created_timestamp else now
         with self._lock:
             self.cursor.execute('''
                 INSERT INTO history (url, title, content, created, updated)
                 VALUES (?, ?, ?, ?, ?)
-            ''', (url, title, content, now, now))
+            ''', (url, title, content, created_time, now))
             self.conn.commit()
         return self.cursor.lastrowid
 
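
The effect of the add_history change is easiest to see in isolation: "created" takes the caller-supplied timestamp when one is given, while "updated" always records the insert time. Below is a self-contained sketch against an in-memory SQLite table; the schema is reduced to the columns named in the INSERT above and is assumed, not taken from the repository:

    import sqlite3
    from datetime import datetime
    from typing import Optional

    conn = sqlite3.connect(":memory:")
    cur = conn.cursor()
    cur.execute("CREATE TABLE history (url TEXT, title TEXT, content TEXT, "
                "created TIMESTAMP, updated TIMESTAMP)")

    def add_history(url: str, title: str, content: str,
                    created_timestamp: Optional[datetime] = None) -> int:
        # Mirrors the patched fallback: an explicit created_timestamp wins,
        # otherwise both columns get the current UTC time.
        now = datetime.utcnow()
        created_time = created_timestamp if created_timestamp else now
        cur.execute(
            "INSERT INTO history (url, title, content, created, updated) "
            "VALUES (?, ?, ?, ?, ?)",
            (url, title, content, created_time, now),
        )
        conn.commit()
        return cur.lastrowid

    # A backfilled entry keeps its original visit time in "created".
    add_history("https://example.com/a", "Example A", "...", datetime(2024, 5, 1, 12, 30))
    # A live crawl stores the current time in both columns.
    add_history("https://example.com/b", "Example B", "...")
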
@@ -63,7 +63,7 @@ class HistoryCrawler(BaseCrawler):
                     self.logger.info(f"Skipping URL from history: {url} ({skip_reason})")
                     continue
 
-                success, result = await self.crawl_url(url, title)
+                success, result = await self.crawl_url(url, title, created_timestamp=timestamp)
                 if success:
                     self.logger.info(f"Processed historical URL: {url}")
 
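
This HistoryCrawler hunk is where the commit's intent lands: each record's own timestamp is forwarded as created_timestamp, so backfilled pages keep their original visit time instead of the crawl time. The surrounding loop is not shown in the diff; the sketch below is an assumed reconstruction of that flow as a standalone function, with the entries iterable, its (url, title, timestamp) shape, and the function name all being illustrative:

    from datetime import datetime
    from typing import Iterable, Tuple

    async def process_history(crawler, logger, entries: Iterable[Tuple[str, str, datetime]]) -> None:
        # Assumed record shape: URL, page title, and the time the page was
        # originally visited (domain-exclusion skipping omitted for brevity).
        for url, title, timestamp in entries:
            success, result = await crawler.crawl_url(url, title, created_timestamp=timestamp)
            if success:
                logger.info(f"Processed historical URL: {url}")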