More domain exclusion

This commit is contained in:
2025-04-12 15:15:22 -05:00
parent 3596bb2a08
commit 382d5ee705
5 changed files with 71 additions and 16 deletions

2
.env
View File

@@ -1 +1 @@
CRAWL_INTERVAL=5
CRAWL_INTERVAL=3

View File

@@ -6,7 +6,7 @@ excluded_domains:
- "namecheap.com"
- "us.ovhcloud.com"
- "linode.com"
- "github.com/settings/"
- "github.com/settings"
- "secure.backblaze.com"
- "login."
- "identity."

View File

@@ -8,4 +8,4 @@ source (dirname (status filename))/.venv/bin/activate.fish
python src/main.py > /dev/null 2>&1 &
# Print a simple confirmation message
echo "Browser Recall started in background"
echo "Browser Recall started in background with PID $!"

View File

@@ -24,11 +24,9 @@ class BaseCrawler:
if url.startswith("about:") or url.startswith("chrome:"):
return True, "Browser internal URL"
domain = urlparse(url).netloc
# Check domain exclusions
if self.domain_exclusions.is_excluded(domain):
return True, "Excluded domain"
# Check domain exclusions using the full URL, not just the domain
if self.domain_exclusions.is_excluded(url):
return True, "Excluded domain/path"
# Check if URL exists
if self.db.url_exists(url):

View File

@@ -41,17 +41,74 @@ class DomainExclusions:
def is_excluded(self, url_string):
    """Return True when *url_string* matches any configured exclusion pattern.

    Patterns in ``self.excluded_domains`` are interpreted three ways:
      1. Patterns containing ``/`` (e.g. ``"github.com/settings"``) are
         matched as a domain-plus-path prefix, so specific paths can be
         excluded while the rest of the site is allowed.
      2. Plain domain patterns (e.g. ``"namecheap.com"``) match the exact
         domain or any subdomain of it.
      3. Patterns with a leading or trailing dot (e.g. ``"login."``) are
         matched as a substring anywhere inside the domain.

    Empty, non-string, domainless, or unparseable URLs are excluded
    (True) as a safety default.
    """
    if not url_string or not isinstance(url_string, str):
        return True  # Exclude invalid URLs
    input_url_stripped = url_string.strip()
    try:
        parsed_url = urlparse(input_url_stripped)
        domain = parsed_url.netloc
        # Basic check: if domain itself is empty (can happen with file:// URLs etc.)
        if not domain:
            return True  # Exclude URLs without a domain
        # Combine domain and path for path-specific exclusions
        path = parsed_url.path if parsed_url.path else ''
        # Ensure path starts with / if it exists and isn't empty, handle root case
        if not path.startswith('/') and path:
            path = '/' + path
        elif not path:
            path = '/'  # Represent root path explicitly for matching
        domain_and_path = domain + path
        for pattern in self.excluded_domains:
            # 1. Check path-specific patterns first (more specific),
            #    e.g. "github.com/settings" matches via prefix.
            if '/' in pattern:
                # Normalize trailing slashes so "a/b/" and "a/b" compare equal
                normalized_pattern = pattern.rstrip('/')
                normalized_domain_path = domain_and_path.rstrip('/')
                # Handle root path case explicitly
                if normalized_pattern == domain and path == '/':
                    return True
                if normalized_domain_path.startswith(normalized_pattern) and normalized_pattern != domain:
                    return True
                continue  # Don't check domain ending if it was a path pattern
            # 2. Check if the domain equals the pattern or ends with it
            #    (handles subdomains, e.g. "ap.www.namecheap.com" vs
            #    "namecheap.com"). The '.' prefix on both sides prevents
            #    partial matches like 'example.com' matching 'ample.com'.
            pattern_for_endswith = '.' + pattern if not pattern.startswith('.') else pattern
            domain_for_endswith = '.' + domain
            if domain == pattern or domain_for_endswith.endswith(pattern_for_endswith):
                return True
            # 3. Substring patterns like "login." or ".auth." may match
            #    anywhere within the domain part only.
            if pattern.startswith('.') or pattern.endswith('.'):
                if pattern in domain:
                    return True
    except ValueError:
        # urlparse can raise ValueError on malformed URLs (e.g. invalid ports)
        print(f"Warning: Could not parse URL '{url_string}' for exclusion check.")
        return True  # Exclude unparseable URLs
    except Exception as e:
        # Defensive: exclude anything that errors during processing
        print(f"Warning: Error processing URL '{url_string}' for exclusion: {e}")
        return True  # Exclude URLs that cause errors during processing
    # If no patterns matched
    return False