diff --git a/.env b/.env
index 970177f..8ac3db1 100644
--- a/.env
+++ b/.env
@@ -1 +1 @@
-CRAWL_INTERVAL=5
\ No newline at end of file
+CRAWL_INTERVAL=3
\ No newline at end of file
diff --git a/config/history_config.yaml b/config/history_config.yaml
index cfb52a3..e458236 100644
--- a/config/history_config.yaml
+++ b/config/history_config.yaml
@@ -6,7 +6,7 @@ excluded_domains:
   - "namecheap.com"
   - "us.ovhcloud.com"
   - "linode.com"
-  - "github.com/settings/"
+  - "github.com/settings"
   - "secure.backblaze.com"
   - "login."
   - "identity."
diff --git a/run-browser-recall.fish b/run-browser-recall.fish
index bbf2edb..e4cd2e0 100755
--- a/run-browser-recall.fish
+++ b/run-browser-recall.fish
@@ -8,4 +8,4 @@ source (dirname (status filename))/.venv/bin/activate.fish
 python src/main.py > /dev/null 2>&1 &
 
 # Print a simple confirmation message
-echo "Browser Recall started in background"
\ No newline at end of file
+echo "Browser Recall started in background with PID $last_pid"
\ No newline at end of file
diff --git a/src/base_crawler.py b/src/base_crawler.py
index 34f5934..b953ac7 100644
--- a/src/base_crawler.py
+++ b/src/base_crawler.py
@@ -24,11 +24,9 @@ class BaseCrawler:
         if url.startswith("about:") or url.startswith("chrome:"):
             return True, "Browser internal URL"
 
-        domain = urlparse(url).netloc
-
-        # Check domain exclusions
-        if self.domain_exclusions.is_excluded(domain):
-            return True, "Excluded domain"
+        # Check domain exclusions using the full URL, not just the domain
+        if self.domain_exclusions.is_excluded(url):
+            return True, "Excluded domain/path"
 
         # Check if URL exists
         if self.db.url_exists(url):
diff --git a/src/domain_exclusions.py b/src/domain_exclusions.py
index d223986..5bb83d1 100644
--- a/src/domain_exclusions.py
+++ b/src/domain_exclusions.py
@@ -41,17 +41,74 @@ class DomainExclusions:
 
     def is_excluded(self, url_string):
         if not url_string or not isinstance(url_string, str):
-            return True
+            return True  # Exclude invalid URLs
 
-        input_url = url_string.strip()
+        input_url_stripped = url_string.strip()
 
-        # If the url starts with www, remove it
-        if input_url.startswith('www.'):
-            input_url = input_url[4:]
+        try:
+            parsed_url = urlparse(input_url_stripped)
+            domain = parsed_url.netloc
 
-        for pattern in self.excluded_domains:
-            if pattern in input_url:
-                return True
+            # Basic check: if domain itself is empty (can happen with file:// URLs etc.)
+            if not domain:
+                return True  # Exclude URLs without a domain
+
+            # Combine domain and path for path-specific exclusions
+            path = parsed_url.path if parsed_url.path else ''
+            # Ensure path starts with / if it exists and isn't empty, handle root case
+            if not path.startswith('/') and path:
+                path = '/' + path
+            elif not path:
+                path = '/'  # Represent root path explicitly for matching
+            domain_and_path = domain + path
+
+
+            for pattern in self.excluded_domains:
+                # 1. Check for path-specific patterns first (more specific)
+                #    Use startswith for patterns like "github.com/settings"
+                #    Ensure pattern doesn't end with '/' unless path is just '/'
+                if '/' in pattern:
+                    # Normalize pattern ending for comparison
+                    normalized_pattern = pattern.rstrip('/')
+                    normalized_domain_path = domain_and_path.rstrip('/')
+                    # Handle root path case explicitly
+                    if normalized_pattern == domain and path == '/':
+                        # print(f"DEBUG: URL '{url_string}' excluded by root path pattern '{pattern}'")
+                        return True
+                    if normalized_domain_path.startswith(normalized_pattern) and normalized_pattern != domain:
+                        # print(f"DEBUG: URL '{url_string}' excluded by path pattern '{pattern}' matching '{normalized_domain_path}'")
+                        return True
+                    continue  # Don't check domain ending if it was a path pattern
+
+                # 2. Check if the domain ends with the pattern (handles subdomains)
+                #    Also check for exact match.
+                #    Example: domain "ap.www.namecheap.com" ends with pattern "namecheap.com"
+                #    Example: domain "localhost" matches pattern "localhost"
+                #    Add '.' prefix for endswith check to avoid partial matches like 'example.com' matching 'ample.com'
+                pattern_for_endswith = '.' + pattern if not pattern.startswith('.') else pattern
+                domain_for_endswith = '.' + domain
+
+                if domain == pattern or domain_for_endswith.endswith(pattern_for_endswith):
+                    # print(f"DEBUG: URL '{url_string}' excluded by domain pattern '{pattern}' matching domain '{domain}'")
+                    return True
+
+                # 3. Check for patterns intended to match anywhere (like "login.", ".auth.")
+                #    This is less precise but matches the original intent of some patterns.
+                #    Check within the domain part only.
+                if pattern.startswith('.') or pattern.endswith('.'):
+                    if pattern in domain:
+                        # print(f"DEBUG: URL '{url_string}' excluded by substring pattern '{pattern}' in domain '{domain}'")
+                        return True
+
+
+        except ValueError:
+            # Handle potential errors from urlparse on malformed URLs
+            print(f"Warning: Could not parse URL '{url_string}' for exclusion check.")
+            return True  # Exclude unparseable URLs
+        except Exception as e:
+            # Log other errors during URL parsing or checking
+            print(f"Warning: Error processing URL '{url_string}' for exclusion: {e}")
+            return True  # Exclude URLs that cause errors during processing
 
         # If no patterns matched
         return False
\ No newline at end of file
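
For reference, a minimal sketch (not part of the diff) of how the reworked is_excluded is expected to behave against patterns like those in config/history_config.yaml. It assumes DomainExclusions is importable from src/domain_exclusions.py, that the module imports urlparse at the top (not visible in this hunk), and that excluded_domains is a plain list of pattern strings; the helper bypasses __init__, whose signature is outside this diff, so construction here is purely illustrative.

# Illustrative only: exercises the new matching rules under the assumptions above.
from src.domain_exclusions import DomainExclusions  # assumed import path


def make_checker(patterns):
    # Bypass __init__ (its arguments are not shown in the diff) and set the list directly.
    checker = DomainExclusions.__new__(DomainExclusions)
    checker.excluded_domains = patterns
    return checker


def test_path_and_domain_exclusions():
    checker = make_checker(["github.com/settings", "namecheap.com", "login."])
    # Path pattern: only the settings section of github.com is excluded.
    assert checker.is_excluded("https://github.com/settings/profile")
    assert not checker.is_excluded("https://github.com/torvalds/linux")
    # Domain pattern: subdomains match via the dotted endswith check.
    assert checker.is_excluded("https://ap.www.namecheap.com/dashboard")
    assert not checker.is_excluded("https://example.com/namecheap.com-review")
    # "login." style patterns match anywhere inside the host, not in the path.
    assert checker.is_excluded("https://login.example.org/")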