diff --git a/.gitignore b/.gitignore index 10e1ba6..47aa7e7 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,4 @@ target/ blacklists blacklists.tar.gz py-squid-blacklists.conf - -# Others -.idea/ \ No newline at end of file +pybl.conf \ No newline at end of file diff --git a/LICENSE b/LICENSE deleted file mode 100644 index d2c1ac7..0000000 --- a/LICENSE +++ /dev/null @@ -1,23 +0,0 @@ -Copyright (c) 2016, Paul Lecuq -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - -* Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - -* Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md index 04625b5..bc82494 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,19 @@ -# py-squid-blacklists +# pybl + Squid helper handling squidguard blacklists written in python * Only supports domains blacklists actually (ie : google.com, www.google.com, mail.google.com, etc.) -* In config specified blacklists are loaded in RAM or CDB backend using https://github.com/acg/python-cdb -* Usable as an external acl plugin of squid -* Written because of poor development on squidguard and some issues using blacklists on squid3 +* In config specified blacklists are loaded in memory or CDB backend using https://github.com/bbayles/python-pure-cdb +* Usable as an external acl plugin for squid 3 +* Written because of poor development on squidguard and some issues using blacklists on squid +* Python 3 supported as of 2020 + ## Usage Add this configuration to squid.conf : ``` -external_acl_type urlblacklist_lookup ttl=5 %URI /usr/bin/python /usr/local/py-squid-blacklists/py-squid-blacklists.py +external_acl_type urlblacklist_lookup ttl=5 %DST /usr/bin/python /usr/local/pybl/pybl.py ... acl urlblacklist external urlblacklist_lookup ... @@ -20,15 +23,16 @@ http_access deny urlblacklist Config file must be include following statements ``` url = http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz -base_dir = /usr/local/py-squid-blacklists/ -categories = adult,malware -db_backend = cdb +basedir = /usr/local/pybl/ +categories = adult,malware # categories are coma separated values +backend = cdb ``` * url : squidguard-like blacklists files, this variable is not already usable -* base_dir : root path containing blacklists files, metadata (update datetime) +* basedir : root path containing blacklists files, metadata (update datetime) * categories : blacklists to use for filtering -* db_backend : database flavour (ram|cdb) +* backend : database flavour (ram|cdb) + ## TODO @@ -39,6 +43,7 @@ db_backend = cdb * Tests (wip) * ... + ## DBs support ideas * High performance but heavy RAM usage when using dict() @@ -46,7 +51,7 @@ db_backend = cdb * CDB backend seems to be as fast as attended, with a very small footprint -## DBs Benchmarks +## DBs Benchmarks (2016) RAM usage for one thread with categories ["adult","malware"] @@ -54,3 +59,35 @@ Debian 8 / python 2.7.9 / squid 3.4.8 * ram : 90Mo * cdb : 6Mo + +Ubuntu 20.04 / python 3.8.2 / squid 4.9 + +* ram : 249Mo +* cdb : 12Mo + + +## License + +Copyright (c) 2016, 2020 PaulBSD +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/py-sb-tool.py b/py-sb-tool.py deleted file mode 100755 index 060cd84..0000000 --- a/py-sb-tool.py +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env python2.7 - -import os -import sys -import tarfile -import urllib - -from pysquidblacklists import PySquidBlacklistsConfig - -print("Parsing configuration file ...") -config = PySquidBlacklistsConfig() -config.get_config() - - -def download(url, path): - bl_file = urllib.URLopener() - bl_file.retrieve(url, path) - - -def extract(base_dir, archive): - if not os.path.isdir(base_dir): - bl_file = tarfile.open(archive) - bl_file.extractall(base_dir) - else: - pass - - -def usage(): - print("tool.py import : import blacklists using config file") - - -if len(sys.argv) > 1: - if sys.argv[1] == "import": - print("Retrieving %s, storing it to %s ..." % (config.url, config.archive)) - download(config.url, config.archive) - print("Extracting blacklists to %s" % config.base_dir) - extract(config.base_dir, config.archive) -else: - print(usage()) \ No newline at end of file diff --git a/py-squid-blacklists.py b/py-squid-blacklists.py deleted file mode 100755 index 43dfb74..0000000 --- a/py-squid-blacklists.py +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env python2.7 -from pysquidblacklists import * - -config = PySquidBlacklistsConfig() -config.get_config(__file__) - -bli = PySquidBlacklistsImporter(config) -bl = PySquidBlacklistsRunner(config, bli) -bl.loop() \ No newline at end of file diff --git a/pybl-cmd.py b/pybl-cmd.py new file mode 100755 index 0000000..d074c80 --- /dev/null +++ b/pybl-cmd.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 + +from pybl import * + + +if __name__ == "__main__": + config = PyBLConfig() + config.getconfig(__file__) + + bli = PyBLImporter(config) + bl = PyBLRunner(config, bli) + bl.loop() diff --git a/pybl-tool.py b/pybl-tool.py new file mode 100755 index 0000000..01b89ce --- /dev/null +++ b/pybl-tool.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python3 + +from pybl import PyBLConfig + +import os +import sys +import tarfile +import urllib.request +import argparse + + +def download(url, path): + try: + print("Retrieving {}, storing it to {} ...".format(config.url, config.archive)) + filename, _ = urllib.request.urlretrieve(url, filename=path) + print("{} Successfully downloaded".format(filename)) + except Exception as exp: + print(exp) + exit(1) + + +def folder_exists(basedir): + ret = os.path.isdir(basedir) + print("Destination folder {} already exists".format(basedir)) + return ret + + +def extract(basedir, archive): + print("Extracting blacklists to {} ...".format(config.basedir)) + bl_file = tarfile.open(archive) + bl_file.extractall(basedir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Action') + parser.add_argument('--config', metavar='config', type=str, nargs=1, default='pybl.conf', help='path to config file') + parser.add_argument('--force', action="store_true", help='force download and extract archive') + parser.add_argument('action', metavar='action', type=str, help='action blacklists using config file', choices=["import"]) + args = parser.parse_args() + + if args.action == "import": + config = PyBLConfig() + print("Parsing configuration file ...") + config.getconfig(args.config[0]) + exists = folder_exists(config.basedir) + if not exists or args.force: + download(config.url, config.archive) + extract(config.basedir, config.archive) diff --git a/py-squid-blacklists.conf.sample b/pybl.conf.sample similarity index 77% rename from py-squid-blacklists.conf.sample rename to pybl.conf.sample index 0962b9e..f412f83 100644 --- a/py-squid-blacklists.conf.sample +++ b/pybl.conf.sample @@ -6,7 +6,7 @@ url = http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz categories = adult,malware # base directory for blacklist extraction -base_dir = /usr/local/py-squid-blacklists/ +basedir = /usr/local/py-squid-blacklists/ # ram | cdb -db_backend = cdb \ No newline at end of file +backend = cdb \ No newline at end of file diff --git a/pybl/PyBLConfig.py b/pybl/PyBLConfig.py new file mode 100644 index 0000000..33c0969 --- /dev/null +++ b/pybl/PyBLConfig.py @@ -0,0 +1,47 @@ +import configparser +import os +import sys + +class PyBLConfig: + + def __init__(self): + self.default_filename = "pybl.conf" + self.default_config_path = "/etc/{}".format(self.default_filename) + self.url = None + self.filename = None + self.basedir = "" + self.blacklistsdir = "" + self.archive = "" + self.backend = "" + self.categories = None + self.config = configparser.RawConfigParser() + self.config_path = None + + + def getconfig(self, pwd): + self.getpath(pwd) + self.config.read(self.config_path) + self.url = str(self.config.get("main", "url")) + self.filename = self.url.split("/").pop() + self.basedir = str(self.config.get("main", "basedir")) + self.blacklistsdir = "{}/blacklists".format(self.basedir) + self.archive = "{}/{}".format("/tmp", self.filename) + self.backend = str(self.config.get("main", "backend")) + self.categories = [] + for cat in self.config.get("main", "categories").split(","): + self.categories.append(str(cat)) + + + def setconfig(self, section, attr): + self.config.set(section, attr) + + + def getpath(self, pwd): + config_path = "{}/{}".format(os.path.dirname(os.path.abspath(pwd)), self.default_filename) + + if os.path.exists(config_path): + self.config_path = config_path + elif os.path.exists(self.default_config_path): + self.config_path = self.default_config_path + else: + sys.exit("No config file available at common paths (current dir or /etc). Must initialize it") diff --git a/pybl/PyBLImporter.py b/pybl/PyBLImporter.py new file mode 100644 index 0000000..9fc44d2 --- /dev/null +++ b/pybl/PyBLImporter.py @@ -0,0 +1,71 @@ +import os +import re +import cdblib.compat as cdb + + +class PyBLImporter: + + def __init__(self, config): + """Importer initializer""" + self.backend = config.backend + self.categories = config.categories + self.basedir = config.basedir + self.blacklistsdir = config.blacklistsdir + if os.path.isdir(self.basedir): + self.domain_files = [os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(self.blacklistsdir)) for f in fn if re.match(r"domains$", f)] + else: + exit("blacklistsdir doesn't exists. Please update using pybl-tool.py") + self.blacklist_files = self.makelists() + self.cache = None + if self.backend == "ram": + self.makeram() + elif self.backend == "cdb": + self.makecdb() + + + def makelists(self): + """Create blacklists of domains""" + blacklists = [] + for l in self.domain_files: + splitlist = l.split("/") + list_type = splitlist[len(splitlist) - 2] + blacklists.append([list_type, l]) + return blacklists + + + def makeram(self): + """Make dict based in-memory database""" + lib = dict() + for bls in self.blacklist_files: + cat = bls[0] + if cat in self.categories: + blc = dict() + f = open(bls[1], 'r') + for l in f: + blc[l.strip("\n")] = True + lib[cat] = blc + del blc + self.cache = lib + + + def makecdb(self): + """Make CDB database""" + lib = [] + for bl in self.blacklist_files: + bl_cdb_file = "{}/{}.cdb".format(self.basedir, bl[0]) + bl_cdb_file_tmp = "{}/{}.tmp".format(self.basedir, bl[0]) + if bl[0] in self.categories: + if not os.path.isfile(bl_cdb_file): + cdb_file = cdb.cdbmake(bl_cdb_file, bl_cdb_file_tmp) + f = open(bl[1], "r") + for l in f: + cdb_file.add(l.strip("\n"), "True") + cdb_file.finish() + lib.append(bl_cdb_file) + self.cache = lib + + + def makepickle(self): + """Support for key value based PickleDB, not yet implemented""" + lib = [] + return lib \ No newline at end of file diff --git a/pybl/PyBLRunner.py b/pybl/PyBLRunner.py new file mode 100644 index 0000000..ff85e85 --- /dev/null +++ b/pybl/PyBLRunner.py @@ -0,0 +1,72 @@ +from urllib.parse import urlparse + +import sys +import logging +import logging.handlers +import cdblib.compat as cdb + + +class PyBLRunner: + + def __init__(self, config, bli): + self.basedir = config.basedir + self.backend = config.backend + self.cache = bli.cache + self.cdb_cache = dict() + if self.backend == "ram": + pass + elif self.backend == "cdb": + for blacklist in self.cache: + self.cdb_cache[blacklist] = cdb.init(blacklist) + self.loop() + + + def domaincompare(self, inputstring): + result = False + for blacklist in self.cache: + tmpline = inputstring + while not result and tmpline != "": + try: + if self.backend == "ram": + result = self.cache[blacklist][tmpline] + elif self.backend == "cdb": + result = self.cdb_cache[blacklist][tmpline] + pass + except KeyError: + pass + tmpline = tmpline.partition('.')[2] + return result + + + def getfqdn(self, url): + # newurl = urlparse(url) + # return newurl.netloc.rsplit(" -")[0] + return url.rsplit(" -")[0] + + + def loop(self): + while True: + my_logger = logging.getLogger('MyLogger') + my_logger.setLevel(logging.DEBUG) + handler = logging.handlers.SysLogHandler(address = '/dev/log') + my_logger.addHandler(handler) + + try: + line = sys.stdin.readline().strip() + fqdn = self.getfqdn(line) + my_logger.critical("|{}|".format(fqdn)) + if line == "": + exit() + if line: + if self.domaincompare(fqdn): + self.response("OK log='{} OK'".format(fqdn)) + else: + self.response("ERR log='{} ERR'".format(fqdn)) + except IOError: + pass + + + @staticmethod + def response(r): + sys.stdout.write("{}\n".format(r)) + sys.stdout.flush() diff --git a/pybl/__init__.py b/pybl/__init__.py new file mode 100644 index 0000000..97eb9ee --- /dev/null +++ b/pybl/__init__.py @@ -0,0 +1,3 @@ +from .PyBLRunner import PyBLRunner +from .PyBLImporter import PyBLImporter +from .PyBLConfig import PyBLConfig \ No newline at end of file diff --git a/pysquidblacklists/__init__.py b/pysquidblacklists/__init__.py deleted file mode 100644 index c677702..0000000 --- a/pysquidblacklists/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from pysquidblacklists import PySquidBlacklistsRunner -from pysquidblacklists import PySquidBlacklistsImporter -from pysquidblacklists import PySquidBlacklistsConfig diff --git a/pysquidblacklists/pysquidblacklists.py b/pysquidblacklists/pysquidblacklists.py deleted file mode 100644 index e385072..0000000 --- a/pysquidblacklists/pysquidblacklists.py +++ /dev/null @@ -1,151 +0,0 @@ -import os -import re -import sys -import configparser -import cdb -from urlparse import urlparse - - -class PySquidBlacklistsRunner: - def __init__(self, config, bli): - self.base_dir = config.base_dir - self.db_backend = config.db_backend - self.cache = bli.cache - self.cdb_cache = dict() - if self.db_backend == "ram": - pass - elif self.db_backend == "cdb": - for blacklist in self.cache: - self.cdb_cache[blacklist] = cdb.init(blacklist) - self.loop() - - def domain_compare(self): - result = False - for blacklist in self.cache: - tmpline = self.outline - while not result and tmpline != "": - try: - if self.db_backend == "ram": - result = self.cache[blacklist][tmpline] - elif self.db_backend == "cdb": - result = self.cdb_cache[blacklist][tmpline] - pass - except KeyError: - pass - tmpline = tmpline.partition('.')[2] - return result - - def loop(self): - while True: - try: - line = sys.stdin.readline().strip() - if line == "": - exit() - self.outline = urlparse(line).netloc - if line: - if self.domain_compare(): - self.response("OK") - else: - self.response("ERR") - except IOError: - pass - - @staticmethod - def response(r): - sys.stdout.write("%s\n" % r) - sys.stdout.flush() - - -class PySquidBlacklistsImporter: - def __init__(self, config): - self.db_backend = config.db_backend - self.categories = config.categories - self.base_dir = config.base_dir - self.blacklists_dir = config.blacklists_dir - if os.path.isdir(self.base_dir): - self.domain_files = [os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(self.blacklists_dir)) for f in - fn if re.match(r"domains*", f)] - else: - exit("blacklists_dir doesn't exists. Please update using py-sb-tool.py") - self.blacklist_files = self.make_list() - self.cache = None - if self.db_backend == "ram": - self.make_ram_db() - elif self.db_backend == "cdb": - self.make_cdb_db() - - def make_list(self): - blacklists = [] - for l in self.domain_files: - splitlist = l.split("/") - list_type = splitlist[len(splitlist) - 2] - blacklists.append([list_type, l]) - return blacklists - - def make_ram_db(self): - lib = dict() - for bls in self.blacklist_files: - if bls[0] in self.categories: - blcache = dict() - f = open(bls[1], "r") - for l in f: - blcache[l.strip("\n")] = True - lib[bls[0]] = blcache - del blcache - self.cache = lib - - def make_cdb_db(self): - lib = [] - for bl in self.blacklist_files: - bl_cdb_file = ("%s/%s.cdb" % (self.base_dir, bl[0])) - bl_cdb_file_tmp = ("%s/%s.tmp" % (self.base_dir, bl[0])) - if bl[0] in self.categories: - if not os.path.isfile(bl_cdb_file): - cdb_file = cdb.cdbmake(bl_cdb_file, bl_cdb_file_tmp) - f = open(bl[1], "r") - for line in f: - cdb_file.add(line.strip("\n"), "True") - cdb_file.finish() - lib.append(bl_cdb_file) - self.cache = lib - - -class PySquidBlacklistsConfig: - def __init__(self): - self.url = None - self.filename = None - self.base_dir = None - self.blacklists_dir = None - self.archive = None - self.db_backend = None - self.categories = None - self.config = configparser.RawConfigParser() - self.config_path = None - - def get_config(self, pwd): - self.get_path(pwd) - self.config.read(self.config_path) - self.url = str(self.config.get("main", "url")) - self.filename = self.url.split("/").pop() - self.base_dir = str(self.config.get("main", "base_dir")) - self.blacklists_dir = "%sblacklists" % self.base_dir - self.archive = "%s%s" % ("/tmp/", self.filename) - self.db_backend = str(self.config.get("main", "db_backend")) - self.categories = [] - for cat in self.config.get("main", "categories").split(","): - self.categories.append(str(cat)) - - def set_config(self, section, attr): - self.config.set(section, attr) - - def get_path(self, pwd): - filename = "py-squid-blacklists.conf" - config_path = "%s/%s" % (os.path.dirname(os.path.abspath(pwd)), filename) - default_config_path = "/etc/%s" % filename - - if os.path.exists(config_path): - self.config_path = config_path - elif os.path.exists(default_config_path): - self.config_path = default_config_path - else: - exit("No config file available at common paths (current dir or /etc). Must initialize it")