More domain exclusion

This commit is contained in:
2025-04-12 15:15:22 -05:00
parent 3596bb2a08
commit 382d5ee705
5 changed files with 71 additions and 16 deletions

View File

@@ -41,17 +41,74 @@ class DomainExclusions:
def is_excluded(self, url_string):
    """Return True when ``url_string`` must be skipped, False otherwise.

    The URL is parsed with ``urlparse`` and compared against every entry
    of ``self.excluded_domains``.  Three kinds of pattern are recognised:

    * Path patterns (contain ``/``), e.g. ``"github.com/settings"``:
      match when ``host + path`` starts with the pattern.  A pattern that
      is just a host followed by ``/`` (``"example.org/"``) matches only
      the root page of that host.
    * Domain patterns, e.g. ``"namecheap.com"``: match the host itself
      and any of its subdomains (``ap.www.namecheap.com``), but not
      look-alikes such as ``notnamecheap.com``.
    * Fragment patterns that start or end with a dot, e.g. ``"login."``
      or ``".auth."``: match when they occur anywhere in the host.

    Host comparison is case-insensitive and ignores userinfo; the port is
    ignored unless the pattern itself spells one out (``"localhost:3000"``).

    The check fails closed: empty or non-string input, URLs without a
    host (``file://``, ``mailto:``, scheme-less strings) and any error
    raised while parsing or matching all yield True.

    Args:
        url_string: The URL to test.

    Returns:
        True if the URL is invalid or matches an exclusion pattern,
        False if no pattern matched.
    """
    if not url_string or not isinstance(url_string, str):
        return True  # Exclude invalid URLs

    try:
        parsed_url = urlparse(url_string.strip())

        # ``hostname`` is lower-cased and has userinfo/port removed, unlike
        # ``netloc``; matching on raw netloc let "GitHub.com" or
        # "github.com:443" slip past a "github.com" pattern.
        host = parsed_url.hostname or ''
        if not host:
            return True  # Exclude URLs without a domain

        # Keep "host:port" as a second candidate so patterns that include
        # an explicit port keep working.
        host_with_port = parsed_url.netloc.rpartition('@')[2].lower()
        candidates = [host]
        if host_with_port != host:
            candidates.append(host_with_port)

        # Represent the root path explicitly so "example.org/" can match it.
        path = parsed_url.path or '/'
        if not path.startswith('/'):
            path = '/' + path

        for raw_pattern in self.excluded_domains:
            # Lower-case only the host part of the pattern; paths stay
            # case-sensitive.
            head, sep, tail = raw_pattern.partition('/')
            pattern = head.lower() + sep + tail

            for domain in candidates:
                # 1. Path-specific patterns (more specific, checked first).
                if '/' in pattern:
                    normalized_pattern = pattern.rstrip('/')
                    if normalized_pattern == domain:
                        # Bare "host/" pattern: only the root page matches.
                        if path == '/':
                            return True
                    elif (domain + path).rstrip('/').startswith(normalized_pattern):
                        return True
                    continue  # Path patterns never fall through to domain checks

                # 2. Exact host or subdomain match.  Both sides get a leading
                #    dot so "ample.com" cannot match "example.com".
                suffix = pattern if pattern.startswith('.') else '.' + pattern
                if domain == pattern or ('.' + domain).endswith(suffix):
                    return True

                # 3. Fragment patterns such as "login." or ".auth." may appear
                #    anywhere inside the host.
                if (pattern.startswith('.') or pattern.endswith('.')) and pattern in domain:
                    return True
    except ValueError:
        # urlparse rejects some malformed URLs (e.g. a broken IPv6 literal).
        print(f"Warning: Could not parse URL '{url_string}' for exclusion check.")
        return True  # Exclude unparseable URLs
    except Exception as e:
        # Deliberately broad: any unexpected failure (e.g. a non-string
        # pattern in the list) must exclude the URL rather than crash.
        print(f"Warning: Error processing URL '{url_string}' for exclusion: {e}")
        return True  # Exclude URLs that cause errors during processing

    # If no patterns matched
    return False