reworked py-squid-blacklists for python3, renamed pybl

This commit is contained in:
Paul 2020-02-29 13:00:17 +01:00
parent 09390850c4
commit b08aac8b01
14 changed files with 304 additions and 241 deletions

4
.gitignore vendored
View File

@ -65,6 +65,4 @@ target/
blacklists blacklists
blacklists.tar.gz blacklists.tar.gz
py-squid-blacklists.conf py-squid-blacklists.conf
pybl.conf
# Others
.idea/

23
LICENSE
View File

@ -1,23 +0,0 @@
Copyright (c) 2016, Paul Lecuq
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,16 +1,19 @@
# py-squid-blacklists # pybl
Squid helper handling squidguard blacklists written in python Squid helper handling squidguard blacklists written in python
* Currently only supports domain blacklists (e.g. google.com, www.google.com, mail.google.com, etc.) * Currently only supports domain blacklists (e.g. google.com, www.google.com, mail.google.com, etc.)
* In config specified blacklists are loaded in RAM or CDB backend using https://github.com/acg/python-cdb * In config specified blacklists are loaded in memory or CDB backend using https://github.com/bbayles/python-pure-cdb
* Usable as an external acl plugin of squid * Usable as an external acl plugin for squid 3
* Written because of poor development on squidguard and some issues using blacklists on squid3 * Written because of poor development on squidguard and some issues using blacklists on squid
* Python 3 supported as of 2020
## Usage ## Usage
Add this configuration to squid.conf : Add this configuration to squid.conf :
``` ```
external_acl_type urlblacklist_lookup ttl=5 %URI /usr/bin/python /usr/local/py-squid-blacklists/py-squid-blacklists.py external_acl_type urlblacklist_lookup ttl=5 %DST /usr/bin/python3 /usr/local/pybl/pybl-cmd.py
... ...
acl urlblacklist external urlblacklist_lookup acl urlblacklist external urlblacklist_lookup
... ...
@ -20,15 +23,16 @@ http_access deny urlblacklist
The config file must include the following statements The config file must include the following statements
``` ```
url = http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz url = http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz
base_dir = /usr/local/py-squid-blacklists/ basedir = /usr/local/pybl/
categories = adult,malware categories = adult,malware # categories are comma-separated values
db_backend = cdb backend = cdb
``` ```
* url : squidguard-like blacklists files, this variable is not yet usable * url : squidguard-like blacklists files, this variable is not yet usable
* base_dir : root path containing blacklists files, metadata (update datetime) * basedir : root path containing blacklists files, metadata (update datetime)
* categories : blacklists to use for filtering * categories : blacklists to use for filtering
* db_backend : database flavour (ram|cdb) * backend : database flavour (ram|cdb)
## TODO ## TODO
@ -39,6 +43,7 @@ db_backend = cdb
* Tests (wip) * Tests (wip)
* ... * ...
## DBs support ideas ## DBs support ideas
* High performance but heavy RAM usage when using dict() * High performance but heavy RAM usage when using dict()
@ -46,7 +51,7 @@ db_backend = cdb
* CDB backend seems to be as fast as expected, with a very small footprint * CDB backend seems to be as fast as expected, with a very small footprint
## DBs Benchmarks ## DBs Benchmarks (2016)
RAM usage for one thread with categories ["adult","malware"] RAM usage for one thread with categories ["adult","malware"]
@ -54,3 +59,35 @@ Debian 8 / python 2.7.9 / squid 3.4.8
* ram : 90Mo * ram : 90Mo
* cdb : 6Mo * cdb : 6Mo
Ubuntu 20.04 / python 3.8.2 / squid 4.9
* ram : 249Mo
* cdb : 12Mo
## License
Copyright (c) 2016, 2020 PaulBSD
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,39 +0,0 @@
#!/usr/bin/env python2.7
import os
import sys
import tarfile
import urllib
from pysquidblacklists import PySquidBlacklistsConfig
# Load the configuration once at import time; the functions and the command
# dispatch in this script read the module-level ``config`` object.
print("Parsing configuration file ...")
config = PySquidBlacklistsConfig()
# get_config() needs a path: the config file is looked up in the directory
# of that path first, then in /etc. Calling it without an argument raised
# TypeError before anything else could run.
config.get_config(__file__)
def download(url, path):
    """Fetch ``url`` and store the payload in the local file ``path``."""
    urllib.URLopener().retrieve(url, path)
def extract(base_dir, archive):
    """Unpack ``archive`` into ``base_dir`` unless that directory already exists.

    Args:
        base_dir: Destination directory; an existing directory is left untouched.
        archive: Path of the tar archive to unpack.
    """
    if os.path.isdir(base_dir):
        # Existing installation: nothing to do.
        return
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(archive) as bl_file:
        bl_file.extractall(base_dir)
def usage():
    """Write the one-line command summary of this tool to standard output."""
    sys.stdout.write("tool.py import : import blacklists using config file\n")
# Command dispatch: "import" downloads and unpacks the blacklists; anything
# else (including no argument or an unknown action) shows the usage line.
if len(sys.argv) > 1 and sys.argv[1] == "import":
    print("Retrieving %s, storing it to %s ..." % (config.url, config.archive))
    download(config.url, config.archive)
    print("Extracting blacklists to %s" % config.base_dir)
    extract(config.base_dir, config.archive)
else:
    # usage() prints by itself; wrapping it in print() emitted a stray "None".
    usage()

View File

@ -1,9 +0,0 @@
#!/usr/bin/env python2.7
from pysquidblacklists import *
# Read the configuration located next to this script (or in /etc).
settings = PySquidBlacklistsConfig()
settings.get_config(__file__)

# Build the blacklist databases, then answer requests on stdin.
importer = PySquidBlacklistsImporter(settings)
runner = PySquidBlacklistsRunner(settings, importer)
runner.loop()

12
pybl-cmd.py Executable file
View File

@ -0,0 +1,12 @@
#!/usr/bin/env python3
from pybl import *
if __name__ == "__main__":
    # Read pybl.conf from this script's directory, falling back to /etc.
    settings = PyBLConfig()
    settings.getconfig(__file__)

    # Build the blacklist databases, then answer requests on stdin.
    importer = PyBLImporter(settings)
    runner = PyBLRunner(settings, importer)
    runner.loop()

48
pybl-tool.py Executable file
View File

@ -0,0 +1,48 @@
#!/usr/bin/env python3
from pybl import PyBLConfig
import os
import sys
import tarfile
import urllib.request
import argparse
def download(url, path):
    """Download ``url`` and store it in the local file ``path``.

    Args:
        url: Location of the archive (any scheme ``urllib.request`` handles).
        path: Local file name the payload is written to.

    Returns:
        The local file name reported by ``urlretrieve``.

    Exits the process with status 1 when the transfer fails.
    """
    # Report the arguments actually used rather than a module-level object,
    # so the function also works when imported from another module.
    print("Retrieving {}, storing it to {} ...".format(url, path))
    try:
        filename, _ = urllib.request.urlretrieve(url, filename=path)
    except (OSError, ValueError) as exp:
        # URLError / HTTPError / ContentTooShortError derive from OSError;
        # ValueError is raised for malformed or unsupported URLs.
        print(exp, file=sys.stderr)
        sys.exit(1)
    print("{} Successfully downloaded".format(filename))
    return filename
def folder_exists(basedir):
    """Tell whether ``basedir`` is an existing directory.

    A notice is printed only when the directory is really there; it used to
    be printed unconditionally, which was misleading on a first import.

    Returns:
        True if ``basedir`` exists and is a directory, False otherwise.
    """
    exists = os.path.isdir(basedir)
    if exists:
        print("Destination folder {} already exists".format(basedir))
    return exists
def extract(basedir, archive):
    """Unpack the tar archive ``archive`` into ``basedir``.

    Args:
        basedir: Destination directory (created by tarfile if missing).
        archive: Path of the downloaded tar archive.
    """
    # Use the parameter, not a module-level config object.
    print("Extracting blacklists to {} ...".format(basedir))
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(archive) as bl_file:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, refuse members that would escape
            # ``basedir`` (absolute paths, "..", outside links).
            bl_file.extractall(basedir, filter="data")
        else:
            # NOTE(review): older interpreters extract unfiltered; only use
            # archives from a trusted source there.
            bl_file.extractall(basedir)
def build_parser():
    """Create the command-line parser of pybl-tool.

    ``--config`` is a plain string option. With ``nargs=1`` the default
    ('pybl.conf') stayed a string while explicit values became one-element
    lists, so indexing ``[0]`` turned the default into the letter 'p'.
    """
    parser = argparse.ArgumentParser(description='Action')
    parser.add_argument('--config', metavar='config', type=str, default='pybl.conf', help='path to config file')
    parser.add_argument('--force', action="store_true", help='force download and extract archive')
    parser.add_argument('action', metavar='action', type=str, help='action blacklists using config file', choices=["import"])
    return parser


if __name__ == "__main__":
    args = build_parser().parse_args()

    if args.action == "import":
        # ``config`` stays a module-level name so that helpers in this script
        # which look it up globally keep working.
        config = PyBLConfig()
        print("Parsing configuration file ...")
        config.getconfig(args.config)
        exists = folder_exists(config.basedir)
        # Skip the download when the blacklists are already installed,
        # unless the user explicitly asks for a refresh.
        if not exists or args.force:
            download(config.url, config.archive)
            extract(config.basedir, config.archive)

View File

@ -6,7 +6,7 @@ url = http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz
categories = adult,malware categories = adult,malware
# base directory for blacklist extraction # base directory for blacklist extraction
base_dir = /usr/local/py-squid-blacklists/ basedir = /usr/local/py-squid-blacklists/
# ram | cdb # ram | cdb
db_backend = cdb backend = cdb

47
pybl/PyBLConfig.py Normal file
View File

@ -0,0 +1,47 @@
import configparser
import os
import sys
class PyBLConfig:
    """Settings read from the ``[main]`` section of ``pybl.conf``.

    After :meth:`getconfig` the instance exposes ``url``, ``filename`` (last
    path component of the URL), ``basedir``, ``blacklistsdir``
    (``<basedir>/blacklists``), ``archive`` (download target under /tmp),
    ``backend`` and ``categories`` (list of category names).
    """

    def __init__(self):
        self.default_filename = "pybl.conf"
        self.default_config_path = "/etc/{}".format(self.default_filename)
        self.url = None
        self.filename = None
        self.basedir = ""
        self.blacklistsdir = ""
        self.archive = ""
        self.backend = ""
        self.categories = None
        self.config = configparser.RawConfigParser()
        self.config_path = None

    def getconfig(self, pwd):
        """Locate the config file relative to ``pwd`` and load every setting.

        Args:
            pwd: Any path whose directory should be searched for ``pybl.conf``
                (typically ``__file__`` or the config file path itself).

        Raises:
            configparser.Error: if the ``[main]`` section or an option is missing.
        """
        self.getpath(pwd)
        self.config.read(self.config_path)
        self.url = self.config.get("main", "url")
        self.filename = self.url.split("/").pop()
        self.basedir = self.config.get("main", "basedir")
        # os.path.join copes with basedir given with or without trailing slash.
        self.blacklistsdir = os.path.join(self.basedir, "blacklists")
        self.archive = os.path.join("/tmp", self.filename)
        self.backend = self.config.get("main", "backend")
        # Tolerate "adult, malware" and trailing commas: a category carrying
        # stray whitespace would never match a blacklist directory name.
        raw_categories = self.config.get("main", "categories")
        self.categories = [cat.strip() for cat in raw_categories.split(",") if cat.strip()]

    def setconfig(self, section, attr, value=None):
        """Set option ``attr`` of ``section`` to ``value`` in the parsed config.

        ``value`` defaults to None, which is what the two-argument form
        always stored.
        """
        self.config.set(section, attr, value)

    def getpath(self, pwd):
        """Pick the config file: next to ``pwd`` first, then ``/etc/pybl.conf``.

        Exits the process with an explanatory message when neither exists.
        """
        config_path = os.path.join(os.path.dirname(os.path.abspath(pwd)), self.default_filename)
        if os.path.exists(config_path):
            self.config_path = config_path
        elif os.path.exists(self.default_config_path):
            self.config_path = self.default_config_path
        else:
            sys.exit("No config file available at common paths (current dir or /etc). Must initialize it")

71
pybl/PyBLImporter.py Normal file
View File

@ -0,0 +1,71 @@
import os
import re
import cdblib.compat as cdb
class PyBLImporter:
    """Build the lookup databases for the configured blacklist categories.

    ``cache`` holds the result: for the "ram" backend a dict mapping each
    category to a dict of its domains, for the "cdb" backend a list of
    ``.cdb`` file paths. It stays None for any other backend value.
    """

    def __init__(self, config):
        """Collect the ``domains`` files and build the selected backend.

        Exits the process when the blacklists directory is missing.
        """
        self.backend = config.backend
        self.categories = config.categories
        self.basedir = config.basedir
        self.blacklistsdir = config.blacklistsdir
        # Check the directory that is actually walked (and named in the
        # message); testing basedir alone silently produced an empty list
        # when the archive had never been extracted.
        root = os.path.expanduser(self.blacklistsdir)
        if not os.path.isdir(root):
            raise SystemExit("blacklistsdir doesn't exists. Please update using pybl-tool.py")
        self.domain_files = [
            os.path.join(dirpath, name)
            for dirpath, _dirnames, filenames in os.walk(root)
            for name in filenames
            if name == "domains"
        ]
        self.blacklist_files = self.makelists()
        self.cache = None
        if self.backend == "ram":
            self.makeram()
        elif self.backend == "cdb":
            self.makecdb()

    def makelists(self):
        """Return ``[category, path]`` pairs, the category being the parent directory name."""
        return [[os.path.basename(os.path.dirname(path)), path] for path in self.domain_files]

    def makeram(self):
        """Make dict based in-memory database"""
        lib = {}
        for cat, path in self.blacklist_files:
            if cat not in self.categories:
                continue
            # ``with`` closes each blacklist file as soon as it is loaded.
            with open(path, "r") as handle:
                lib[cat] = {line.strip("\n"): True for line in handle}
        self.cache = lib

    def makecdb(self):
        """Make CDB database"""
        lib = []
        for cat, path in self.blacklist_files:
            if cat not in self.categories:
                continue
            bl_cdb_file = "{}/{}.cdb".format(self.basedir, cat)
            bl_cdb_file_tmp = "{}/{}.tmp".format(self.basedir, cat)
            # An existing .cdb file is reused as is; delete it to force a rebuild.
            if not os.path.isfile(bl_cdb_file):
                cdb_file = cdb.cdbmake(bl_cdb_file, bl_cdb_file_tmp)
                with open(path, "r") as handle:
                    for line in handle:
                        cdb_file.add(line.strip("\n"), "True")
                cdb_file.finish()
            lib.append(bl_cdb_file)
        self.cache = lib

    def makepickle(self):
        """Support for key value based PickleDB, not yet implemented"""
        return []

72
pybl/PyBLRunner.py Normal file
View File

@ -0,0 +1,72 @@
from urllib.parse import urlparse
import sys
import logging
import logging.handlers
import cdblib.compat as cdb
class PyBLRunner:
    """Squid external ACL helper: answer one line per request read on stdin.

    Each request is answered with ``OK`` when the requested host, or one of
    its parent domains, is found in a loaded blacklist, and ``ERR`` otherwise.
    """

    def __init__(self, config, bli):
        """Open the databases prepared by ``bli`` and start serving.

        NOTE: the constructor enters :meth:`loop` itself and therefore only
        returns by raising SystemExit once stdin is exhausted.
        """
        self.basedir = config.basedir
        self.backend = config.backend
        self.cache = bli.cache
        self.cdb_cache = dict()
        if self.backend == "cdb":
            # For cdb the importer cache is a list of .cdb file paths.
            for blacklist in self.cache:
                self.cdb_cache[blacklist] = cdb.init(blacklist)
        self.loop()

    def domaincompare(self, inputstring):
        """Return True if ``inputstring`` or any parent domain is blacklisted.

        The name is tried as given, then with the leftmost label removed,
        and so on ("a.b.c" -> "b.c" -> "c"). Host names are case-insensitive
        and may carry a trailing dot, so both are normalised first.
        """
        if self.backend == "ram":
            tables = [self.cache[name] for name in self.cache]
        elif self.backend == "cdb":
            tables = [self.cdb_cache[name] for name in self.cache]
        else:
            return False
        candidate = inputstring.lower().rstrip(".")
        while candidate:
            for table in tables:
                try:
                    if table[candidate]:
                        return True
                except KeyError:
                    # Not in this blacklist; try the next one.
                    pass
            candidate = candidate.partition('.')[2]
        return False

    def getfqdn(self, url):
        """Extract the host name from a request line sent by squid.

        Only the first whitespace-separated token is used (squid may append
        " -" placeholders). A full URL, as sent with ``%URI``, is reduced to
        its host part; a bare host, as sent with ``%DST``, is returned as is.
        """
        tokens = url.split()
        if not tokens:
            return ""
        target = tokens[0]
        if "://" in target:
            return urlparse(target).hostname or ""
        return target

    @staticmethod
    def _logger():
        """Return the helper's logger, attaching its handler exactly once."""
        logger = logging.getLogger('MyLogger')
        logger.setLevel(logging.DEBUG)
        if not logger.handlers:
            try:
                handler = logging.handlers.SysLogHandler(address='/dev/log')
            except OSError:
                # No local syslog socket: keep serving without logging.
                handler = logging.NullHandler()
            logger.addHandler(handler)
        return logger

    def loop(self):
        """Serve requests until stdin yields an empty line or end of file."""
        # Configure logging once; adding a handler on every iteration
        # duplicated each message and leaked one socket per request.
        my_logger = self._logger()
        while True:
            try:
                line = sys.stdin.readline().strip()
                if line == "":
                    sys.exit()
                fqdn = self.getfqdn(line)
                # Per-request trace: debug severity, not critical.
                my_logger.debug("|%s|", fqdn)
                if self.domaincompare(fqdn):
                    self.response("OK log='{} OK'".format(fqdn))
                else:
                    self.response("ERR log='{} ERR'".format(fqdn))
            except IOError:
                # Best effort: ignore the failed request and keep serving.
                pass

    @staticmethod
    def response(r):
        """Write one answer line to stdout and flush it immediately."""
        sys.stdout.write("{}\n".format(r))
        sys.stdout.flush()

3
pybl/__init__.py Normal file
View File

@ -0,0 +1,3 @@
from .PyBLRunner import PyBLRunner
from .PyBLImporter import PyBLImporter
from .PyBLConfig import PyBLConfig

View File

@ -1,3 +0,0 @@
from pysquidblacklists import PySquidBlacklistsRunner
from pysquidblacklists import PySquidBlacklistsImporter
from pysquidblacklists import PySquidBlacklistsConfig

View File

@ -1,151 +0,0 @@
import os
import re
import sys
import configparser
import cdb
from urlparse import urlparse
class PySquidBlacklistsRunner:
    """Squid external ACL helper: answer ``OK``/``ERR`` for each URL on stdin.

    ``OK`` is written when the host of the URL, or one of its parent
    domains, is found in a loaded blacklist; ``ERR`` otherwise.
    """

    def __init__(self, config, bli):
        """Open the databases prepared by ``bli`` and start serving.

        NOTE: the constructor enters :meth:`loop` itself and only returns by
        raising SystemExit once stdin is exhausted.
        """
        self.base_dir = config.base_dir
        self.db_backend = config.db_backend
        self.cache = bli.cache
        self.cdb_cache = dict()
        if self.db_backend == "cdb":
            # For cdb the importer cache is a list of .cdb file paths.
            for blacklist in self.cache:
                self.cdb_cache[blacklist] = cdb.init(blacklist)
        self.loop()

    def domain_compare(self):
        """Return True if ``self.outline`` or a parent domain is blacklisted."""
        if self.db_backend == "ram":
            tables = [self.cache[name] for name in self.cache]
        elif self.db_backend == "cdb":
            tables = [self.cdb_cache[name] for name in self.cache]
        else:
            return False
        candidate = self.outline
        while candidate != "":
            for table in tables:
                try:
                    if table[candidate]:
                        return True
                except KeyError:
                    # Not in this blacklist; try the next one.
                    pass
            # Drop the leftmost label: "a.b.c" -> "b.c" -> "c" -> "".
            candidate = candidate.partition('.')[2]
        return False

    @staticmethod
    def _hostname(line):
        """Return the bare host of a non-empty request line.

        ``netloc`` keeps the port and credentials ("host:8080"), which never
        match a blacklist entry, and is empty for scheme-less "host:443"
        requests; ``hostname`` on a normalised target handles both.
        """
        target = line.split()[0]
        if "://" not in target:
            target = "//" + target
        return urlparse(target).hostname or ""

    def loop(self):
        """Serve requests until stdin yields an empty line or end of file."""
        while True:
            try:
                line = sys.stdin.readline().strip()
                if line == "":
                    sys.exit()
                self.outline = self._hostname(line)
                if self.domain_compare():
                    self.response("OK")
                else:
                    self.response("ERR")
            except IOError:
                # Best effort: ignore the failed request and keep serving.
                pass

    @staticmethod
    def response(r):
        """Write one answer line to stdout and flush it immediately."""
        sys.stdout.write("%s\n" % r)
        sys.stdout.flush()
class PySquidBlacklistsImporter:
    """Build the lookup databases for the configured blacklist categories.

    ``cache`` holds the result: for the "ram" backend a dict mapping each
    category to a dict of its domains, for the "cdb" backend a list of
    ``.cdb`` file paths. It stays None for any other backend value.
    """

    def __init__(self, config):
        """Collect the ``domains`` files and build the selected backend.

        Exits the process when the blacklists directory is missing.
        """
        self.db_backend = config.db_backend
        self.categories = config.categories
        self.base_dir = config.base_dir
        self.blacklists_dir = config.blacklists_dir
        # Check the directory that is actually walked and named in the message.
        root = os.path.expanduser(self.blacklists_dir)
        if not os.path.isdir(root):
            raise SystemExit("blacklists_dir doesn't exists. Please update using py-sb-tool.py")
        # Only files named exactly "domains": the pattern r"domains*" also
        # accepted names such as "domain" or "domains.diff", whose content
        # could then replace the real list of a category.
        self.domain_files = [
            os.path.join(dirpath, name)
            for dirpath, _dirnames, filenames in os.walk(root)
            for name in filenames
            if name == "domains"
        ]
        self.blacklist_files = self.make_list()
        self.cache = None
        if self.db_backend == "ram":
            self.make_ram_db()
        elif self.db_backend == "cdb":
            self.make_cdb_db()

    def make_list(self):
        """Return ``[category, path]`` pairs, the category being the parent directory name."""
        return [[os.path.basename(os.path.dirname(path)), path] for path in self.domain_files]

    def make_ram_db(self):
        """Load every selected category into a dict keyed by domain."""
        lib = dict()
        for category, path in self.blacklist_files:
            if category not in self.categories:
                continue
            blcache = dict()
            # ``with`` closes each blacklist file as soon as it is loaded.
            with open(path, "r") as handle:
                for line in handle:
                    blcache[line.strip("\n")] = True
            lib[category] = blcache
        self.cache = lib

    def make_cdb_db(self):
        """Create a ``.cdb`` file per selected category (existing files are reused)."""
        lib = []
        for category, path in self.blacklist_files:
            if category not in self.categories:
                continue
            bl_cdb_file = ("%s/%s.cdb" % (self.base_dir, category))
            bl_cdb_file_tmp = ("%s/%s.tmp" % (self.base_dir, category))
            if not os.path.isfile(bl_cdb_file):
                cdb_file = cdb.cdbmake(bl_cdb_file, bl_cdb_file_tmp)
                with open(path, "r") as handle:
                    for line in handle:
                        cdb_file.add(line.strip("\n"), "True")
                cdb_file.finish()
            lib.append(bl_cdb_file)
        self.cache = lib
class PySquidBlacklistsConfig:
    """Settings read from the ``[main]`` section of ``py-squid-blacklists.conf``."""

    def __init__(self):
        self.url = None
        self.filename = None
        self.base_dir = None
        self.blacklists_dir = None
        self.archive = None
        self.db_backend = None
        self.categories = None
        self.config = configparser.RawConfigParser()
        self.config_path = None

    def get_config(self, pwd):
        """Locate the config file relative to ``pwd`` and load every setting.

        Args:
            pwd: Any path whose directory should be searched for the config
                file (typically ``__file__``).
        """
        self.get_path(pwd)
        self.config.read(self.config_path)
        self.url = str(self.config.get("main", "url"))
        self.filename = self.url.split("/").pop()
        self.base_dir = str(self.config.get("main", "base_dir"))
        # Plain concatenation produced "/pathblacklists" whenever base_dir
        # was configured without a trailing slash.
        self.blacklists_dir = os.path.join(self.base_dir, "blacklists")
        self.archive = os.path.join("/tmp/", self.filename)
        self.db_backend = str(self.config.get("main", "db_backend"))
        # Tolerate "adult, malware" and trailing commas: a category carrying
        # stray whitespace would never match a blacklist directory name.
        raw_categories = str(self.config.get("main", "categories"))
        self.categories = [cat.strip() for cat in raw_categories.split(",") if cat.strip()]

    def set_config(self, section, attr, value=None):
        """Set option ``attr`` of ``section`` to ``value`` in the parsed config.

        ``value`` defaults to None, which is what the two-argument form
        always stored.
        """
        self.config.set(section, attr, value)

    def get_path(self, pwd):
        """Pick the config file: next to ``pwd`` first, then under ``/etc``.

        Exits the process with an explanatory message when neither exists.
        """
        filename = "py-squid-blacklists.conf"
        config_path = os.path.join(os.path.dirname(os.path.abspath(pwd)), filename)
        default_config_path = "/etc/%s" % filename
        if os.path.exists(config_path):
            self.config_path = config_path
        elif os.path.exists(default_config_path):
            self.config_path = default_config_path
        else:
            sys.exit("No config file available at common paths (current dir or /etc). Must initialize it")