Historical records use original timestamp

This commit is contained in:
2025-04-12 21:13:21 -05:00
parent 93063d2465
commit bfc7f4dd35
3 changed files with 9 additions and 6 deletions

View File

@@ -1,9 +1,10 @@
-from typing import Tuple
+from typing import Tuple, Optional
 from urllib.parse import urlparse
 from database import Database
 from domain_exclusions import DomainExclusions
 from logger import Logger
 from crawl4ai import AsyncWebCrawler
+from datetime import datetime

 class BaseCrawler:
     def __init__(self, db: Database, domain_exclusions: DomainExclusions, logger: Logger):
@@ -34,7 +35,7 @@ class BaseCrawler:
         return False, ""

-    async def crawl_url(self, url: str, default_title: str = None) -> Tuple[bool, dict]:
+    async def crawl_url(self, url: str, default_title: str = None, created_timestamp: Optional[datetime] = None) -> Tuple[bool, dict]:
         try:
             result = await self.crawler.arun(url=url)
             crawl_result = result[0]
@@ -44,7 +45,8 @@ class BaseCrawler:
             self.db.add_history(
                 url=url,
                 title=title,
-                content=content
+                content=content,
+                created_timestamp=created_timestamp
             )
return True, { return True, {

View File

@@ -59,14 +59,15 @@ class Database:
         self.conn.commit()

-    def add_history(self, url: str, title: str, content: str) -> int:
+    def add_history(self, url: str, title: str, content: str, created_timestamp: Optional[datetime] = None) -> int:
         """Add a new history entry."""
         now = datetime.utcnow()
+        created_time = created_timestamp if created_timestamp else now
         with self._lock:
             self.cursor.execute('''
                 INSERT INTO history (url, title, content, created, updated)
                 VALUES (?, ?, ?, ?, ?)
-            ''', (url, title, content, now, now))
+            ''', (url, title, content, created_time, now))
             self.conn.commit()
             return self.cursor.lastrowid

View File

@@ -63,7 +63,7 @@ class HistoryCrawler(BaseCrawler):
                     self.logger.info(f"Skipping URL from history: {url} ({skip_reason})")
                     continue

-                success, result = await self.crawl_url(url, title)
+                success, result = await self.crawl_url(url, title, created_timestamp=timestamp)
                 if success:
                     self.logger.info(f"Processed historical URL: {url}")