From b7299fdb1782b80b37a637f8e010417223989abe Mon Sep 17 00:00:00 2001 From: Paul Lecuq Date: Fri, 26 Feb 2016 15:25:17 +0100 Subject: [PATCH] Code refactoring Importer class provisionned Improved speed using CDB --- config.py.sample | 2 +- py-squid-blacklists.py | 127 +++++++++++++++++++++-------------------- 2 files changed, 66 insertions(+), 63 deletions(-) diff --git a/config.py.sample b/config.py.sample index 984ca7e..28e8b68 100644 --- a/config.py.sample +++ b/config.py.sample @@ -2,7 +2,7 @@ url = "http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz" # categories -categories = ["adult","malware"] +categories = ["adult", "malware"] # base directory for blacklists base_dir = "/usr/local/py-squid-blacklists/blacklists/" diff --git a/py-squid-blacklists.py b/py-squid-blacklists.py index 68f5447..c54f60e 100755 --- a/py-squid-blacklists.py +++ b/py-squid-blacklists.py @@ -18,70 +18,28 @@ except ImportError: class PySquidBlacklists: - def __init__(self, config): - self.db_backend = config.db_backend - self.categories = config.categories + def __init__(self, config, bli): self.base_dir = config.base_dir - self.domain_files = [os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(self.base_dir)) for f in - fn if re.match(r"domains*", f)] - self.blacklist_files = self.make_list() - if self.db_backend == "ram": - self.cache = self.make_ram_db() - elif self.db_backend == "cdb": - self.cache = self.make_cdb_db() - self.loop() - - def make_list(self): - blacklists = [] - for l in self.domain_files: - splitlist = l.split("/") - list_type = splitlist[len(splitlist) - 2] - blacklists.append([list_type, l]) - return blacklists - - def make_ram_db(self): - lib = dict() - for bls in self.blacklist_files: - if self.db_backend == "ram": - if bls[0] in self.categories: - cache = dict() - f = open(bls[1], "r") - for l in f: - cache[l.strip("\n")] = True - lib[bls[0]] = cache - del cache - return lib - - def make_cdb_db(self): - lib = [] - for bl in self.blacklist_files: - bl_cdb_file = ("%s/%s.cdb" % (self.base_dir, bl[0])) - bl_cdb_file_tmp = ("%s/%s.tmp" % (self.base_dir, bl[0])) - if (bl[0] in self.categories): - if not os.path.isfile(bl_cdb_file): - cdb_file = cdb.cdbmake(bl_cdb_file, bl_cdb_file_tmp) - f = open(bl[1], "r") - for line in f: - cdb_file.add(line.strip("\n"), "True") - cdb_file.finish() - lib.append(bl_cdb_file) - return lib - - def domain_compare(self, outline): - global cdb_file - result = False + self.db_backend = config.db_backend + self.cache = bli.cache + self.cdb_cache = dict() for blacklist in self.cache: - tmpline = outline if self.db_backend == "ram": pass elif self.db_backend == "cdb": - cdb_file = cdb.init(blacklist) + self.cdb_cache[blacklist] = cdb.init(blacklist) + self.loop() + + def domain_compare(self): + result = False + for blacklist in self.cache: + tmpline = self.outline while not result and tmpline != "": try: if self.db_backend == "ram": result = self.cache[blacklist][tmpline] elif self.db_backend == "cdb": - result = cdb_file[tmpline] + result = self.cdb_cache[blacklist][tmpline] pass except KeyError: pass @@ -94,12 +52,12 @@ class PySquidBlacklists: line = sys.stdin.readline().strip() if line == "": exit() - outline = urlparse(line).netloc + self.outline = urlparse(line).netloc if line: - if self.domain_compare(outline): - self.response("OK") - else: - self.response("ERR") + if self.domain_compare(): + self.response("OK") + else: + self.response("ERR") except IOError: pass @@ -111,8 +69,53 @@ class PySquidBlacklists: class PySquidBlacklistsImporter: def __init__(self, conf): - self.test = True - self.db = conf.db_backend + self.db_backend = config.db_backend + self.categories = config.categories + self.base_dir = config.base_dir + self.domain_files = [os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(self.base_dir)) for f in + fn if re.match(r"domains*", f)] + self.blacklist_files = self.make_list() + self.cache = None + if self.db_backend == "ram": + self.make_ram_db() + elif self.db_backend == "cdb": + self.make_cdb_db() + + def make_list(self): + blacklists = [] + for l in self.domain_files: + splitlist = l.split("/") + list_type = splitlist[len(splitlist) - 2] + blacklists.append([list_type, l]) + return blacklists + + def make_ram_db(self): + lib = dict() + for bls in self.blacklist_files: + if bls[0] in self.categories: + blcache = dict() + f = open(bls[1], "r") + for l in f: + blcache[l.strip("\n")] = True + lib[bls[0]] = blcache + del blcache + self.cache = lib + + def make_cdb_db(self): + lib = [] + for bl in self.blacklist_files: + bl_cdb_file = ("%s/%s.cdb" % (self.base_dir, bl[0])) + bl_cdb_file_tmp = ("%s/%s.tmp" % (self.base_dir, bl[0])) + if bl[0] in self.categories: + if not os.path.isfile(bl_cdb_file): + cdb_file = cdb.cdbmake(bl_cdb_file, bl_cdb_file_tmp) + f = open(bl[1], "r") + for line in f: + cdb_file.add(line.strip("\n"), "True") + cdb_file.finish() + lib.append(bl_cdb_file) + self.cache = lib -bl = PySquidBlacklists(config) +bli = PySquidBlacklistsImporter(config) +bl = PySquidBlacklists(config, bli)