diff --git a/.env b/.env
index 063eceb..970177f 100644
--- a/.env
+++ b/.env
@@ -1 +1 @@
-CRAWL_INTERVAL=10
\ No newline at end of file
+CRAWL_INTERVAL=5
\ No newline at end of file
diff --git a/config/history_config.yaml b/config/history_config.yaml
index d9a9012..cfb52a3 100644
--- a/config/history_config.yaml
+++ b/config/history_config.yaml
@@ -3,15 +3,14 @@ excluded_domains:
   - "identity.getpostman.com"
   - "localhost"
   - "127.0.0.1"
-  - "*.namecheap.com*"
-  - "us.ovhcloud.com*"
-  - "*.linode.com*"
-  - "github.com/settings/*"
-  - "*.secure.backblaze.com*"
-  - "*.login.*"
-  - "*.identity.*"
-  - "*.auth.*"
-  - "192.168.*.*"
-  - "10.*.*.*"
-  - "172.16.*.*"
-  - "0.0.0.*"
+  - "namecheap.com"
+  - "us.ovhcloud.com"
+  - "linode.com"
+  - "github.com/settings/"
+  - "secure.backblaze.com"
+  - "login."
+  - "identity."
+  - ".auth."
+  - "192.168."
+  - "172.16."
+  - "0.0.0."
diff --git a/src/domain_exclusions.py b/src/domain_exclusions.py
index 20e7490..d223986 100644
--- a/src/domain_exclusions.py
+++ b/src/domain_exclusions.py
@@ -1,5 +1,6 @@
 import yaml
 from fnmatch import fnmatch
+from urllib.parse import urlparse
 
 class DomainExclusions:
     def __init__(self, config_path="config/history_config.yaml"):
@@ -14,42 +15,54 @@ class DomainExclusions:
 
             # Handle both direct list and dict with 'excluded_domains' key
             if isinstance(config, list):
-                self.excluded_domains = config
+                loaded_patterns = config
+            elif isinstance(config, dict):
+                loaded_patterns = config.get('excluded_domains', [])
             else:
-                self.excluded_domains = config.get('excluded_domains', [])
+                loaded_patterns = []  # Handle other invalid config types
+
+            # Strip and lower-case before filtering: a whitespace-only entry
+            # would otherwise be stored as "" and, as a substring, match every URL.
+            self.excluded_domains = [
+                p.strip().lower()
+                for p in loaded_patterns
+                if isinstance(p, str) and p.strip()
+            ]
+            if len(self.excluded_domains) != len(loaded_patterns):
+                print(f"Warning: Some invalid patterns were ignored in {config_path}")
+
         except FileNotFoundError:
             print(f"Warning: Configuration file {config_path} not found. No domains will be excluded.")
+            self.excluded_domains = []  # Ensure it's empty on error
         except yaml.YAMLError as e:
             print(f"Error parsing YAML configuration: {e}")
             self.excluded_domains = []
+        except Exception as e:  # Catch other potential errors
+            print(f"An unexpected error occurred during config loading: {e}")
+            self.excluded_domains = []
 
-    def is_excluded(self, domain):
-        """
-        Check if a domain matches any of the excluded domain patterns.
-        """
-        # Strip protocol (http:// or https://) if present
-        domain = domain.lower().strip('/')
-        if '://' in domain:
-            domain = domain.split('://', 1)[1]
+    def is_excluded(self, url_string):
+        """
+        Check whether a URL contains any of the excluded patterns.
+
+        Matching is a case-insensitive substring test against the whole URL.
+        Non-string, empty or whitespace-only input is treated as excluded.
+        """
+        if not isinstance(url_string, str):
+            return True
 
-        # Strip query parameters if present
-        if '?' in domain:
-            domain = domain.split('?', 1)[0]
+        input_url = url_string.strip().lower()
+        if not input_url:
+            return True
 
-        # Split domain and path
-        if '/' in domain:
-            domain = domain.split('/', 1)[0]
+        # If the url starts with www, remove it
+        if input_url.startswith('www.'):
+            input_url = input_url[4:]
 
         for pattern in self.excluded_domains:
-            pattern = pattern.lower().strip('/')
-            if '/' in pattern:
-                pattern = pattern.split('/', 1)[0]
-
-            # Remove trailing wildcard if present
-            if pattern.endswith('*'):
-                pattern = pattern.rstrip('*').rstrip('.')
-
-            # Use fnmatch for proper wildcard pattern matching
-            if fnmatch(domain, pattern):
+            # An empty pattern is a substring of every URL, so skip it
+            if pattern and pattern.lower() in input_url:
                 return True
+
+        # If no patterns matched
         return False
\ No newline at end of file