From ffa099a05903b5279b4ff26cbfadc81509a33ba9 Mon Sep 17 00:00:00 2001 From: Paul Lecuq Date: Mon, 29 Feb 2016 10:02:01 +0100 Subject: [PATCH] Code refactoring 3 classes : Runner, Importer, Config Improved speed using CDB Several files renamed ConfigParser object used for configuration --- .gitignore | 3 +- README.md | 18 +++-- config.py.sample | 11 --- py-sb-tool.py | 39 ++++++++++ py-squid-blacklists.conf.sample | 12 ++++ py-squid-blacklists.py | 122 ++------------------------------ 6 files changed, 65 insertions(+), 140 deletions(-) delete mode 100644 config.py.sample create mode 100755 py-sb-tool.py create mode 100644 py-squid-blacklists.conf.sample diff --git a/.gitignore b/.gitignore index ab93b0f..f0a7061 100644 --- a/.gitignore +++ b/.gitignore @@ -61,7 +61,6 @@ target/ #Ipython Notebook .ipynb_checkpoints -config.py blacklists blacklists.tar.gz -pysquidblacklists/ \ No newline at end of file +py-squid-blacklists.conf \ No newline at end of file diff --git a/README.md b/README.md index b3b3a8d..39b0926 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Squid helper handling squidguard blacklists written in python * Only supports domains blacklists actually (ie : google.com, www.google.com, mail.google.com, etc.) * In config specified blacklists are loaded in RAM or CDB backend using https://github.com/acg/python-cdb * Usable as an external acl plugin of squid -* Written because of poor developpement on squidguard and some issues using blacklists on squid3 +* Written because of poor development on squidguard and some issues using blacklists on squid3 ## Usage @@ -17,27 +17,25 @@ acl urlblacklist external urlblacklist_lookup http_access deny urlblacklist ``` -config.py file must be include following statements +Config file must be include following statements ``` -url = "http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz" -base_dir = "/usr/local/py-squid-blacklists/blacklists/" -categories = ["adult","malware"] -db_backend = "ram" +url = http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz +base_dir = /usr/local/py-squid-blacklists/ +categories = adult,malware +db_backend = cdb ``` * url : squidguard-like blacklists files, this variable is not already usable +* base_dir : root path containing blacklists files, metadata (update datetime) * categories : blacklists to use for filtering -* base_dir : path containing blacklists files * db_backend : database flavour (ram|cdb) ## TODO -* Auto-fetcher using url if blacklists are not already downloaded or stored on the squid machine +* Auto-fetcher using url if blacklists are not already downloaded or stored on the squid machine (wip) * Compatibility with python3 only * Filters for regex urls -* Reduce memory footprint (wip with CDB backend alternative) * Code optimisation (profiling) and cleaning (wip) -* Object oriented programming (wip) * Tests (wip) * ... diff --git a/config.py.sample b/config.py.sample deleted file mode 100644 index 28e8b68..0000000 --- a/config.py.sample +++ /dev/null @@ -1,11 +0,0 @@ -# url to retrieve blacklists -url = "http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz" - -# categories -categories = ["adult", "malware"] - -# base directory for blacklists -base_dir = "/usr/local/py-squid-blacklists/blacklists/" - -# ram | cdb -db_backend = "cdb" \ No newline at end of file diff --git a/py-sb-tool.py b/py-sb-tool.py new file mode 100755 index 0000000..060cd84 --- /dev/null +++ b/py-sb-tool.py @@ -0,0 +1,39 @@ +#!/usr/bin/env python2.7 + +import os +import sys +import tarfile +import urllib + +from pysquidblacklists import PySquidBlacklistsConfig + +print("Parsing configuration file ...") +config = PySquidBlacklistsConfig() +config.get_config() + + +def download(url, path): + bl_file = urllib.URLopener() + bl_file.retrieve(url, path) + + +def extract(base_dir, archive): + if not os.path.isdir(base_dir): + bl_file = tarfile.open(archive) + bl_file.extractall(base_dir) + else: + pass + + +def usage(): + print("tool.py import : import blacklists using config file") + + +if len(sys.argv) > 1: + if sys.argv[1] == "import": + print("Retrieving %s, storing it to %s ..." % (config.url, config.archive)) + download(config.url, config.archive) + print("Extracting blacklists to %s" % config.base_dir) + extract(config.base_dir, config.archive) +else: + print(usage()) \ No newline at end of file diff --git a/py-squid-blacklists.conf.sample b/py-squid-blacklists.conf.sample new file mode 100644 index 0000000..0962b9e --- /dev/null +++ b/py-squid-blacklists.conf.sample @@ -0,0 +1,12 @@ +[main] +# url to retrieve blacklists +url = http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz + +# categories +categories = adult,malware + +# base directory for blacklist extraction +base_dir = /usr/local/py-squid-blacklists/ + +# ram | cdb +db_backend = cdb \ No newline at end of file diff --git a/py-squid-blacklists.py b/py-squid-blacklists.py index c54f60e..49794fe 100755 --- a/py-squid-blacklists.py +++ b/py-squid-blacklists.py @@ -1,121 +1,9 @@ #!/usr/bin/env python2.7 +from pysquidblacklists import * -import sys -import os -import re -from urlparse import urlparse - -try: - import config -except ImportError: - print("Please create config.py using config.py.sample") - exit() -try: - import cdb -except ImportError: - print("Please install python-cdb from pypi or via package manager") - exit() - - -class PySquidBlacklists: - def __init__(self, config, bli): - self.base_dir = config.base_dir - self.db_backend = config.db_backend - self.cache = bli.cache - self.cdb_cache = dict() - for blacklist in self.cache: - if self.db_backend == "ram": - pass - elif self.db_backend == "cdb": - self.cdb_cache[blacklist] = cdb.init(blacklist) - self.loop() - - def domain_compare(self): - result = False - for blacklist in self.cache: - tmpline = self.outline - while not result and tmpline != "": - try: - if self.db_backend == "ram": - result = self.cache[blacklist][tmpline] - elif self.db_backend == "cdb": - result = self.cdb_cache[blacklist][tmpline] - pass - except KeyError: - pass - tmpline = tmpline.partition('.')[2] - return result - - def loop(self): - while True: - try: - line = sys.stdin.readline().strip() - if line == "": - exit() - self.outline = urlparse(line).netloc - if line: - if self.domain_compare(): - self.response("OK") - else: - self.response("ERR") - except IOError: - pass - - @staticmethod - def response(r): - sys.stdout.write("%s\n" % r) - sys.stdout.flush() - - -class PySquidBlacklistsImporter: - def __init__(self, conf): - self.db_backend = config.db_backend - self.categories = config.categories - self.base_dir = config.base_dir - self.domain_files = [os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(self.base_dir)) for f in - fn if re.match(r"domains*", f)] - self.blacklist_files = self.make_list() - self.cache = None - if self.db_backend == "ram": - self.make_ram_db() - elif self.db_backend == "cdb": - self.make_cdb_db() - - def make_list(self): - blacklists = [] - for l in self.domain_files: - splitlist = l.split("/") - list_type = splitlist[len(splitlist) - 2] - blacklists.append([list_type, l]) - return blacklists - - def make_ram_db(self): - lib = dict() - for bls in self.blacklist_files: - if bls[0] in self.categories: - blcache = dict() - f = open(bls[1], "r") - for l in f: - blcache[l.strip("\n")] = True - lib[bls[0]] = blcache - del blcache - self.cache = lib - - def make_cdb_db(self): - lib = [] - for bl in self.blacklist_files: - bl_cdb_file = ("%s/%s.cdb" % (self.base_dir, bl[0])) - bl_cdb_file_tmp = ("%s/%s.tmp" % (self.base_dir, bl[0])) - if bl[0] in self.categories: - if not os.path.isfile(bl_cdb_file): - cdb_file = cdb.cdbmake(bl_cdb_file, bl_cdb_file_tmp) - f = open(bl[1], "r") - for line in f: - cdb_file.add(line.strip("\n"), "True") - cdb_file.finish() - lib.append(bl_cdb_file) - self.cache = lib - +config = PySquidBlacklistsConfig() +config.get_config() bli = PySquidBlacklistsImporter(config) -bl = PySquidBlacklists(config, bli) +bl = PySquidBlacklistsRunner(config, bli) +bl.loop() \ No newline at end of file