Fix exclusion: match excluded-domain patterns as plain URL substrings instead of fnmatch globs

This commit is contained in:
2025-04-12 15:03:58 -05:00
parent f723a56090
commit 832dde2ae0
3 changed files with 40 additions and 39 deletions

2
.env
View File

@@ -1 +1 @@
CRAWL_INTERVAL=10 CRAWL_INTERVAL=5

View File

@@ -3,15 +3,14 @@ excluded_domains:
- "identity.getpostman.com" - "identity.getpostman.com"
- "localhost" - "localhost"
- "127.0.0.1" - "127.0.0.1"
- "*.namecheap.com*" - "namecheap.com"
- "us.ovhcloud.com*" - "us.ovhcloud.com"
- "*.linode.com*" - "linode.com"
- "github.com/settings/*" - "github.com/settings/"
- "*.secure.backblaze.com*" - "secure.backblaze.com"
- "*.login.*" - "login."
- "*.identity.*" - "identity."
- "*.auth.*" - ".auth."
- "192.168.*.*" - "192.168."
- "10.*.*.*" - "172.16."
- "172.16.*.*" - "0.0.0."
- "0.0.0.*"

View File

@@ -1,5 +1,6 @@
import yaml import yaml
from fnmatch import fnmatch from fnmatch import fnmatch
from urllib.parse import urlparse
class DomainExclusions: class DomainExclusions:
def __init__(self, config_path="config/history_config.yaml"): def __init__(self, config_path="config/history_config.yaml"):
@@ -14,42 +15,43 @@ class DomainExclusions:
# Handle both direct list and dict with 'excluded_domains' key # Handle both direct list and dict with 'excluded_domains' key
if isinstance(config, list): if isinstance(config, list):
self.excluded_domains = config loaded_patterns = config
elif isinstance(config, dict):
loaded_patterns = config.get('excluded_domains', [])
else: else:
self.excluded_domains = config.get('excluded_domains', []) loaded_patterns = [] # Handle other invalid config types
# Basic validation/cleaning of patterns
self.excluded_domains = [
str(p).strip() for p in loaded_patterns if p and isinstance(p, str)
]
# Optional: Warn if some patterns were ignored
# if len(self.excluded_domains) != len(loaded_patterns):
# print(f"Warning: Some invalid patterns were ignored in {config_path}")
except FileNotFoundError: except FileNotFoundError:
print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.") print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
self.excluded_domains = [] # Ensure it's empty on error
except yaml.YAMLError as e: except yaml.YAMLError as e:
print(f"Error parsing YAML configuration: {e}") print(f"Error parsing YAML configuration: {e}")
self.excluded_domains = [] self.excluded_domains = []
except Exception as e: # Catch other potential errors
print(f"An unexpected error occurred during config loading: {e}")
self.excluded_domains = []
def is_excluded(self, domain): def is_excluded(self, url_string):
""" if not url_string or not isinstance(url_string, str):
Check if a domain matches any of the excluded domain patterns. return True
"""
# Strip protocol (http:// or https://) if present
domain = domain.lower().strip('/')
if '://' in domain:
domain = domain.split('://', 1)[1]
# Strip query parameters if present input_url = url_string.strip()
if '?' in domain:
domain = domain.split('?', 1)[0]
# Split domain and path # If the url starts with www, remove it
if '/' in domain: if input_url.startswith('www.'):
domain = domain.split('/', 1)[0] input_url = input_url[4:]
for pattern in self.excluded_domains: for pattern in self.excluded_domains:
pattern = pattern.lower().strip('/') if pattern in input_url:
if '/' in pattern:
pattern = pattern.split('/', 1)[0]
# Remove trailing wildcard if present
if pattern.endswith('*'):
pattern = pattern.rstrip('*').rstrip('.')
# Use fnmatch for proper wildcard pattern matching
if fnmatch(domain, pattern):
return True return True
# If no patterns matched
return False return False