Compare commits

..

No commits in common. "master" and "v1.0" have entirely different histories.
master ... v1.0

12 changed files with 175 additions and 327 deletions

5
.gitignore vendored
View File

@ -61,8 +61,7 @@ target/
#Ipython Notebook
.ipynb_checkpoints
# Assets
config.py
blacklists
blacklists.tar.gz
py-squid-blacklists.conf
pybl.conf
pysquidblacklists/

23
LICENSE Normal file
View File

@ -0,0 +1,23 @@
Copyright (c) 2016, Paul Lecuq
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,50 +1,46 @@
# pybl
# py-squid-blacklists
Squid helper handling squidguard blacklists written in python
* Currently only supports domain blacklists (e.g. google.com, www.google.com, mail.google.com, etc.)
* In config specified blacklists are loaded in memory or CDB backend using https://github.com/bbayles/python-pure-cdb
* Usable as an external acl plugin for squid 3
* Written because of poor development on squidguard and some issues using blacklists on squid
* Python 3 supported as of 2020
* Similar tool in Golang will replace this one in the future
* In config specified blacklists are loaded in RAM or CDB backend using https://github.com/acg/python-cdb
* Usable as an external acl plugin of squid
* Written because of poor development on squidguard and some issues using blacklists on squid3
## Usage
## Usage
Add this configuration to squid.conf :
```
external_acl_type urlblacklist_lookup ttl=5 %DST /usr/bin/python /usr/local/pybl/pybl.py
external_acl_type urlblacklist_lookup ttl=5 %URI /usr/bin/python /usr/local/py-squid-blacklists/py-squid-blacklists.py
...
acl urlblacklist external urlblacklist_lookup
...
http_access deny urlblacklist
```
The config file must include the following statements
The config.py file must include the following statements
```
url = http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz
basedir = /usr/local/pybl/
# categories are comma-separated values
categories = adult,malware
backend = cdb
url = "http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz"
base_dir = "/usr/local/py-squid-blacklists/blacklists/"
categories = ["adult","malware"]
db_backend = "ram"
```
* url : squidguard-like blacklists files, this variable is not yet usable
* basedir : root path containing blacklists files, metadata (update datetime)
* categories : blacklists to use for filtering
* backend : database flavour (ram|cdb)
* base_dir : path containing blacklists files
* db_backend : database flavour (ram|cdb)
## TODO
* Auto-fetcher using url if blacklists are not already downloaded or stored on the squid machine (wip)
* Auto-fetcher using url if blacklists are not already downloaded or stored on the squid machine
* Compatibility with python3 only
* Filters for regex urls
* Reduce memory footprint (wip with CDB backend alternative)
* Code optimisation (profiling) and cleaning (wip)
* Object oriented programming (wip)
* Tests (wip)
* ...
## DBs support ideas
* High performance but heavy RAM usage when using dict()
@ -52,48 +48,11 @@ backend = cdb
* CDB backend seems to be as fast as expected, with a very small footprint
## DBs Benchmarks (2016)
## DBs Benchmarks
RAM usage for one thread with categories ["adult","malware"]
RAM usage For one thread with categories ["adult","malware"]
Debian 8 / python 2.7.9 / squid 3.4.8
* ram : 90Mo
* cdb : 6Mo
Ubuntu 20.04 / python 3.8.2 / squid 4.9
* ram : 249Mo
* cdb : 12Mo
## License
```text
Copyright (c) 2020 PaulBSD
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The views and conclusions contained in the software and documentation are those
of the authors and should not be interpreted as representing official policies,
either expressed or implied, of this project.
```

11
config.py.sample Normal file
View File

@ -0,0 +1,11 @@
# URL of the squidguard-style blacklists archive.
# NOTE(review): py-squid-blacklists.py as shown never reads this value — confirm
# before relying on it for automatic downloads.
url = "http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz"
# Blacklist categories to load; each name is compared with the name of the
# directory that contains a domains file.
categories = ["adult", "malware"]
# Directory that is walked recursively to find the domains files; the
# generated <category>.cdb files are written here when db_backend is "cdb".
base_dir = "/usr/local/py-squid-blacklists/blacklists/"
# Storage backend: "ram" (in-memory dict) or "cdb" (on-disk constant database)
db_backend = "cdb"

121
py-squid-blacklists.py Executable file
View File

@ -0,0 +1,121 @@
#!/usr/bin/env python2.7
import sys
import os
import re
from urlparse import urlparse
try:
import config
except ImportError:
print("Please create config.py using config.py.sample")
exit()
try:
import cdb
except ImportError:
print("Please install python-cdb from pypi or via package manager")
exit()
class PySquidBlacklists:
    """Squid external ACL helper answering ``OK``/``ERR`` for each request.

    One request is read per line on stdin. The host part of the request is
    looked up in every loaded blacklist, together with all of its parent
    domains (``a.b.c`` then ``b.c`` then ``c``). ``OK`` is written when a
    blacklist contains one of them, ``ERR`` otherwise.
    """

    def __init__(self, config, bli):
        """Keep the settings, open the CDB files if needed and start serving.

        :param config: object exposing ``base_dir`` and ``db_backend``.
        :param bli: importer exposing ``cache`` (a dict of category to domain
            dict for the ``ram`` backend, a list of CDB file paths for the
            ``cdb`` backend).

        Note that the constructor calls :meth:`loop` and therefore only
        returns when the process exits.
        """
        self.base_dir = config.base_dir
        self.db_backend = config.db_backend
        self.cache = bli.cache
        self.cdb_cache = dict()
        # Host currently being checked; refreshed by loop() for each request.
        self.outline = ""
        if self.db_backend == "cdb":
            for blacklist in self.cache:
                self.cdb_cache[blacklist] = cdb.init(blacklist)
        self.loop()

    def domain_compare(self):
        """Return a truthy value when ``self.outline`` is blacklisted.

        The host and each of its parent domains are tried against every
        blacklist; the first hit is returned, ``False`` when nothing matches.
        """
        if self.db_backend == "ram":
            store = self.cache
        elif self.db_backend == "cdb":
            store = self.cdb_cache
        else:
            # Unknown backend: nothing can be looked up.
            return False
        for blacklist in self.cache:
            candidate = self.outline
            while candidate != "":
                try:
                    hit = store[blacklist][candidate]
                except KeyError:
                    hit = False
                if hit:
                    return hit
                # Drop the left-most label and retry with the parent domain.
                candidate = candidate.partition('.')[2]
        return False

    @staticmethod
    def _extract_host(line):
        """Return the lower-cased host of a request line, ``""`` if none.

        Handles full URLs (``http://host:port/path``) as well as bare
        ``host:port`` targets, and never raises on malformed input.
        """
        if not line:
            return ""
        try:
            host = urlparse(line).hostname
            if not host:
                # No scheme: parse the line as a network location instead.
                host = urlparse("//" + line).hostname
        except ValueError:
            # e.g. unbalanced IPv6 brackets; treat as "no host".
            return ""
        return host or ""

    def loop(self):
        """Serve requests from stdin until end of file, then exit.

        Only a real end of file (``readline`` returning ``""``) stops the
        helper. A blank line is answered with ``ERR`` so that every request
        line still gets exactly one response line.
        """
        while True:
            try:
                raw = sys.stdin.readline()
                if raw == "":
                    sys.exit()
                self.outline = self._extract_host(raw.strip())
                if self.domain_compare():
                    self.response("OK")
                else:
                    self.response("ERR")
            except IOError:
                # Best effort: ignore transient I/O errors and keep serving.
                pass

    @staticmethod
    def response(r):
        """Write one response line to stdout and flush it immediately."""
        sys.stdout.write("%s\n" % r)
        sys.stdout.flush()
class PySquidBlacklistsImporter:
    """Locate the ``domains`` files under ``base_dir`` and load them.

    After construction ``cache`` holds, depending on ``db_backend``:

    * ``ram``: a dict mapping each selected category to a dict of domains;
    * ``cdb``: a list with the path of one CDB file per selected category;
    * anything else: ``None``.
    """

    def __init__(self, conf):
        """Scan the blacklist tree and build the configured backend.

        :param conf: object exposing ``db_backend``, ``categories`` and
            ``base_dir``. The values are read from this argument rather than
            from the global ``config`` module.
        """
        self.db_backend = conf.db_backend
        self.categories = conf.categories
        self.base_dir = conf.base_dir
        # Only files named exactly "domains" are blacklists; a prefix match
        # would also pick up backups such as "domains.old".
        self.domain_files = [
            os.path.join(dirpath, name)
            for dirpath, _dirnames, filenames in os.walk(os.path.expanduser(self.base_dir))
            for name in filenames
            if name == "domains"
        ]
        self.blacklist_files = self.make_list()
        self.cache = None
        if self.db_backend == "ram":
            self.make_ram_db()
        elif self.db_backend == "cdb":
            self.make_cdb_db()

    def make_list(self):
        """Return ``[category, path]`` pairs for every domains file found.

        The category is the name of the directory containing the file.
        """
        return [
            [os.path.basename(os.path.dirname(path)), path]
            for path in self.domain_files
        ]

    @staticmethod
    def _read_domains(path):
        """Yield the non-empty, whitespace-stripped lines of a domains file."""
        with open(path, "r") as handle:
            for line in handle:
                domain = line.strip()
                if domain:
                    yield domain

    def make_ram_db(self):
        """Load every selected category into an in-memory dict."""
        lib = dict()
        for category, path in self.blacklist_files:
            if category in self.categories:
                lib[category] = {domain: True for domain in self._read_domains(path)}
        self.cache = lib

    def make_cdb_db(self):
        """Create missing ``<category>.cdb`` files and record their paths.

        An existing CDB file is reused as is; it is not rebuilt when the
        source domains file changes.
        """
        lib = []
        root = os.path.expanduser(self.base_dir)
        for category, path in self.blacklist_files:
            if category not in self.categories:
                continue
            cdb_path = os.path.join(root, "%s.cdb" % category)
            tmp_path = os.path.join(root, "%s.tmp" % category)
            if not os.path.isfile(cdb_path):
                maker = cdb.cdbmake(cdb_path, tmp_path)
                for domain in self._read_domains(path):
                    maker.add(domain, "True")
                maker.finish()
            lib.append(cdb_path)
        self.cache = lib
# Build the blacklist databases described by config.py.
bli = PySquidBlacklistsImporter(config)
# Constructing the helper enters its request loop (PySquidBlacklists.__init__
# calls loop()), so this statement only completes when the process exits.
bl = PySquidBlacklists(config, bli)

View File

@ -1,12 +0,0 @@
#!/usr/bin/env python3
from pybl import *
if __name__ == "__main__":
    # Read pybl.conf from this script's directory, falling back to /etc.
    config = PyBLConfig()
    config.getconfig(__file__)
    # Load the blacklists of the configured categories into the chosen backend.
    bli = PyBLImporter(config)
    # NOTE(review): PyBLRunner.__init__ as shown in this diff already calls
    # loop(), so the explicit call below is only reached if the constructor
    # returns — confirm which of the two calls is intended.
    bl = PyBLRunner(config, bli)
    bl.loop()

View File

@ -1,48 +0,0 @@
#!/usr/bin/env python3
from pybl import PyBLConfig
import os
import sys
import tarfile
import urllib.request
import argparse
def download(url, path):
    """Download ``url`` and store it at ``path``.

    Progress is reported on stdout using the function's own arguments, so
    the function no longer depends on a module-level ``config`` object.
    On any failure the error is printed and the process exits with status 1.

    :param url: location of the archive to fetch.
    :param path: local file name the archive is written to.
    """
    try:
        print("Retrieving {}, storing it to {} ...".format(url, path))
        filename, _ = urllib.request.urlretrieve(url, filename=path)
        print("{} Successfully downloaded".format(filename))
    except Exception as exp:
        # Top-level boundary of the command line tool: report and give up.
        print(exp)
        sys.exit(1)
def folder_exists(basedir):
    """Return ``True`` when ``basedir`` is an existing directory.

    The "already exists" notice is printed only when the directory is
    actually present.

    :param basedir: path of the destination folder to check.
    """
    ret = os.path.isdir(basedir)
    if ret:
        print("Destination folder {} already exists".format(basedir))
    return ret
def extract(basedir, archive):
    """Extract the tar ``archive`` into ``basedir``.

    The archive comes from the network, so every member name is checked
    first: a member that would be written outside ``basedir`` (for example
    ``../x`` or an absolute path) aborts the extraction before anything is
    written.

    :param basedir: destination directory.
    :param archive: path of the tar archive to unpack.
    :raises tarfile.TarError: if the archive is unreadable or contains a
        member that escapes ``basedir``.
    """
    print("Extracting blacklists to {} ...".format(basedir))
    root = os.path.realpath(basedir)
    with tarfile.open(archive) as bl_file:
        for member in bl_file.getmembers():
            target = os.path.realpath(os.path.join(root, member.name))
            if os.path.commonpath([root, target]) != root:
                raise tarfile.TarError(
                    "Refusing to extract {} outside of {}".format(member.name, basedir)
                )
        # NOTE: link targets are not validated here; only member names are.
        bl_file.extractall(basedir)
if __name__ == "__main__":
    # Command line entry point: "import" downloads and unpacks the blacklists.
    parser = argparse.ArgumentParser(description='Action')
    # No nargs here: with nargs=1 an explicit value is a one-item list while
    # the default stays a plain string, so indexing [0] on the default
    # silently yielded the single character 'p'.
    parser.add_argument('--config', metavar='config', type=str, default='pybl.conf', help='path to config file')
    parser.add_argument('--force', action="store_true", help='force download and extract archive')
    parser.add_argument('action', metavar='action', type=str, help='action blacklists using config file', choices=["import"])
    args = parser.parse_args()
    if args.action == "import":
        # Kept as a module-level name because other functions of this script
        # may read it.
        config = PyBLConfig()
        print("Parsing configuration file ...")
        config.getconfig(args.config)
        exists = folder_exists(config.basedir)
        # Fetch the archive only when nothing is installed yet, or on --force.
        if not exists or args.force:
            download(config.url, config.archive)
            extract(config.basedir, config.archive)

View File

@ -1,12 +0,0 @@
[main]
# url to retrieve blacklists
url = http://dsi.ut-capitole.fr/blacklists/download/blacklists.tar.gz
# categories
categories = adult,malware
# base directory for blacklist extraction
basedir = /usr/local/py-squid-blacklists/
# ram | cdb
backend = cdb

View File

@ -1,47 +0,0 @@
import configparser
import os
import sys
class PyBLConfig:
    """Settings of pybl, read from the ``[main]`` section of ``pybl.conf``."""

    def __init__(self):
        """Create an empty configuration; call :meth:`getconfig` to fill it."""
        self.default_filename = "pybl.conf"
        self.default_config_path = "/etc/{}".format(self.default_filename)
        self.url = None
        self.filename = None
        self.basedir = ""
        self.blacklistsdir = ""
        self.archive = ""
        self.backend = ""
        self.categories = None
        self.config = configparser.RawConfigParser()
        self.config_path = None

    def getconfig(self, pwd):
        """Locate the configuration file and load every setting from it.

        :param pwd: path of a file whose directory is searched first for
            ``pybl.conf`` (see :meth:`getpath`).

        ``categories`` is split on commas; surrounding whitespace is removed
        and empty items are dropped, so ``adult, malware`` gives
        ``["adult", "malware"]``.
        """
        self.getpath(pwd)
        self.config.read(self.config_path)
        self.url = str(self.config.get("main", "url"))
        # Last path component of the URL, used as the archive file name.
        self.filename = self.url.split("/").pop()
        self.basedir = str(self.config.get("main", "basedir"))
        self.blacklistsdir = "{}/blacklists".format(self.basedir)
        self.archive = "{}/{}".format("/tmp", self.filename)
        self.backend = str(self.config.get("main", "backend"))
        raw_categories = str(self.config.get("main", "categories"))
        self.categories = [
            cat.strip() for cat in raw_categories.split(",") if cat.strip()
        ]

    def setconfig(self, section, attr, value=None):
        """Set option ``attr`` of ``section`` to ``value`` in the parser.

        ``value`` defaults to ``None``, which is what the two-argument form
        of this method has always stored.
        """
        self.config.set(section, attr, value)

    def getpath(self, pwd):
        """Select the configuration file to use and store it in ``config_path``.

        The directory of ``pwd`` is tried first, then ``/etc``. The process
        exits with an error message when neither contains ``pybl.conf``.
        """
        config_path = "{}/{}".format(os.path.dirname(os.path.abspath(pwd)), self.default_filename)
        if os.path.exists(config_path):
            self.config_path = config_path
        elif os.path.exists(self.default_config_path):
            self.config_path = self.default_config_path
        else:
            sys.exit("No config file available at common paths (current dir or /etc). Must initialize it")

View File

@ -1,71 +0,0 @@
import os
import re
import cdblib.compat as cdb
class PyBLImporter:
    """Find the ``domains`` files of the blacklists and load them.

    After construction ``cache`` holds a dict of category to domain dict
    (``ram`` backend), a list of CDB file paths (``cdb`` backend) or
    ``None`` for any other backend value.
    """

    def __init__(self, config):
        """Importer initializer.

        :param config: object exposing ``backend``, ``categories``,
            ``basedir`` and ``blacklistsdir``.
        :raises SystemExit: when ``blacklistsdir`` is not a directory. This
            is the directory that is walked and that the message refers to.
        """
        self.backend = config.backend
        self.categories = config.categories
        self.basedir = config.basedir
        self.blacklistsdir = config.blacklistsdir
        blacklists_root = os.path.expanduser(self.blacklistsdir)
        if not os.path.isdir(blacklists_root):
            raise SystemExit("blacklistsdir doesn't exists. Please update using pybl-tool.py")
        self.domain_files = [
            os.path.join(dirpath, name)
            for dirpath, _dirnames, filenames in os.walk(blacklists_root)
            for name in filenames
            if name == "domains"
        ]
        self.blacklist_files = self.makelists()
        self.cache = None
        if self.backend == "ram":
            self.makeram()
        elif self.backend == "cdb":
            self.makecdb()

    def makelists(self):
        """Create the list of ``[category, path]`` pairs of the domains files.

        The category is the name of the directory containing the file.
        """
        return [
            [os.path.basename(os.path.dirname(path)), path]
            for path in self.domain_files
        ]

    @staticmethod
    def _read_domains(path):
        """Yield the non-empty, whitespace-stripped lines of a domains file."""
        with open(path, "r") as handle:
            for line in handle:
                domain = line.strip()
                if domain:
                    yield domain

    def makeram(self):
        """Make dict based in-memory database"""
        lib = dict()
        for cat, path in self.blacklist_files:
            if cat in self.categories:
                lib[cat] = {domain: True for domain in self._read_domains(path)}
        self.cache = lib

    def makecdb(self):
        """Make CDB database.

        A ``<category>.cdb`` file is created in ``basedir`` for each selected
        category that does not have one yet; existing files are reused.
        """
        lib = []
        for cat, path in self.blacklist_files:
            if cat not in self.categories:
                continue
            bl_cdb_file = "{}/{}.cdb".format(self.basedir, cat)
            bl_cdb_file_tmp = "{}/{}.tmp".format(self.basedir, cat)
            if not os.path.isfile(bl_cdb_file):
                cdb_file = cdb.cdbmake(bl_cdb_file, bl_cdb_file_tmp)
                for domain in self._read_domains(path):
                    cdb_file.add(domain, "True")
                cdb_file.finish()
            lib.append(bl_cdb_file)
        self.cache = lib

    def makepickle(self):
        """Support for key value based PickleDB, not yet implemented"""
        lib = []
        return lib

View File

@ -1,72 +0,0 @@
from urllib.parse import urlparse
import sys
import logging
import logging.handlers
import cdblib.compat as cdb
class PyBLRunner:
    """Squid external ACL helper answering ``OK``/``ERR`` for each request.

    One request is read per line on stdin; its host and all of its parent
    domains are looked up in the loaded blacklists.
    """

    def __init__(self, config, bli):
        """Keep the settings, open the CDB files if needed and start serving.

        :param config: object exposing ``basedir`` and ``backend``.
        :param bli: importer exposing ``cache`` (dict of category to domain
            dict for ``ram``, list of CDB file paths for ``cdb``).

        Note that the constructor calls :meth:`loop` and therefore only
        returns when the process exits.
        """
        self.basedir = config.basedir
        self.backend = config.backend
        self.cache = bli.cache
        self.cdb_cache = dict()
        if self.backend == "cdb":
            for blacklist in self.cache:
                self.cdb_cache[blacklist] = cdb.init(blacklist)
        self.loop()

    def domaincompare(self, inputstring):
        """Return a truthy value when ``inputstring`` is blacklisted.

        The host and each of its parent domains are tried against every
        blacklist; the first hit is returned, ``False`` when nothing matches.
        """
        if self.backend == "ram":
            store = self.cache
        elif self.backend == "cdb":
            store = self.cdb_cache
        else:
            # Unknown backend: nothing can be looked up.
            return False
        for blacklist in self.cache:
            candidate = inputstring
            while candidate != "":
                try:
                    hit = store[blacklist][candidate]
                except KeyError:
                    hit = False
                if hit:
                    return hit
                # Drop the left-most label and retry with the parent domain.
                candidate = candidate.partition('.')[2]
        return False

    def getfqdn(self, url):
        """Return the part of the request line that precedes the first `` -``."""
        return url.rsplit(" -")[0]

    @staticmethod
    def _get_logger():
        """Return the syslog logger, attaching its handler only once.

        Adding a new SysLogHandler for every request would open a socket
        each time and duplicate every message.
        """
        my_logger = logging.getLogger('MyLogger')
        my_logger.setLevel(logging.DEBUG)
        if not my_logger.handlers:
            my_logger.addHandler(logging.handlers.SysLogHandler(address='/dev/log'))
        return my_logger

    def loop(self):
        """Serve requests from stdin until end of file, then exit.

        Only a real end of file (``readline`` returning ``""``) stops the
        helper. A blank line is answered with ``ERR`` so that every request
        line still gets exactly one response line.
        """
        my_logger = self._get_logger()
        while True:
            try:
                raw = sys.stdin.readline()
                if raw == "":
                    sys.exit()
                fqdn = self.getfqdn(raw.strip())
                # NOTE(review): every request is logged at CRITICAL level;
                # kept as is, but this looks like debugging output.
                my_logger.critical("|{}|".format(fqdn))
                if self.domaincompare(fqdn):
                    self.response("OK log='{} OK'".format(fqdn))
                else:
                    self.response("ERR log='{} ERR'".format(fqdn))
            except IOError:
                # Best effort: ignore transient I/O errors and keep serving.
                pass

    @staticmethod
    def response(r):
        """Write one response line to stdout and flush it immediately."""
        sys.stdout.write("{}\n".format(r))
        sys.stdout.flush()

View File

@ -1,3 +0,0 @@
from .PyBLRunner import PyBLRunner
from .PyBLImporter import PyBLImporter
from .PyBLConfig import PyBLConfig