Commit a03cc14f authored by Ad Schellevis

(proxy) improve remote acl handling, sort and structure output for squid.

parent 78c3d912
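For context: the script reads its remote ACL definitions from /usr/local/etc/squid/externalACLs.conf, one ConfigParser section per ACL with url, filter and enabled options (handled in main() below). A minimal sketch of how such a section is parsed, assuming Python 2.7; the section name and option values are made-up examples, not part of this commit:

from ConfigParser import ConfigParser  # stdlib parser, the same one the script uses
import StringIO

# hypothetical externalACLs.conf section; real entries are generated by the OPNsense GUI
sample_cnf = """[bannedsites]
url=https://example.org/blacklist.tar.gz
filter=adult,ads
enabled=1
"""

cnf = ConfigParser()
cnf.readfp(StringIO.StringIO(sample_cnf))
for section in cnf.sections():
    print section, cnf.get(section, 'url'), cnf.get(section, 'enabled')
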
 #!/usr/local/bin/python2.7
 """
+Copyright (c) 2016 Ad Schellevis - Deciso B.V.
 Copyright (c) 2015 Jos Schellevis - Deciso B.V.
 All rights reserved.
@@ -26,6 +27,7 @@
 POSSIBILITY OF SUCH DAMAGE.
 """
+import tempfile
 import urllib2
 import os
 import json
@@ -42,54 +44,65 @@ acl_config_fn = ('/usr/local/etc/squid/externalACLs.conf')
 acl_target_dir = ('/usr/local/etc/squid/acl')
 acl_max_timeout = 30
 
 
-class ACLDownload(object):
+class Downloader(object):
+    """ Download helper
+    """
     def __init__(self, url, timeout):
         """ init new
+        :param url: source url
+        :param timeout: timeout in seconds
         """
         self._url = url
         self._timeout = timeout
-        self._source_data = None
+        self._source_handle = None
         self._target_data = None
 
     def fetch(self):
-        """ fetch (raw) source data into self._source_data
+        """ fetch (raw) source data into tempfile using self._source_handle
         """
         try:
-            f = urllib2.urlopen(self._url,timeout = self._timeout)
-            self._source_data = f.read()
+            f = urllib2.urlopen(self._url, timeout = self._timeout)
+            # flush to temp file
+            self._source_handle = tempfile.NamedTemporaryFile()
+            while True:
+                data = f.read(1024)
+                if not data:
+                    break
+                else:
+                    self._source_handle.write(data)
+            self._source_handle.seek(0)
             f.close()
         except (urllib2.URLError, urllib2.HTTPError, IOError) as e:
             syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s'%self._url)
-            self._source_data = None
+            self._source_handle = None
 
     def get_files(self):
         """ process downloaded data, handle compression
-        :return: iterator filename, content
+        :return: iterator filename, file handle
         """
-        if self._source_data is not None:
+        if self._source_handle is not None:
             # handle compressed data
             if (len(self._url) > 8 and self._url[-7:] == '.tar.gz') \
                     or (len(self._url) > 4 and self._url[-4:] == '.tgz'):
                 # source is in tar.gz format, extract all into a single string
                 try:
-                    tf = tarfile.open(fileobj=StringIO.StringIO(self._source_data))
+                    tf = tarfile.open(fileobj=self._source_handle)
                     for tf_file in tf.getmembers():
                         if tf_file.isfile():
-                            yield tf_file.name, tf.extractfile(tf_file).read()
+                            yield tf_file.name, tf.extractfile(tf_file)
                 except IOError as e:
                     syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)'%(self._url, e))
             elif len(self._url) > 4 and self._url[-3:] == '.gz':
                 # source is in .gz format unpack
                 try:
-                    gf = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(self._source_data))
-                    yield os.path.basename(self._url), gf.read()
+                    gf = gzip.GzipFile(mode='r', fileobj=self._source_handle)
+                    yield os.path.basename(self._url), gf
                 except IOError as e:
                     syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)'%(self._url, e))
             elif len(self._url) > 5 and self._url[-4:] == '.zip':
                 # source is in .zip format, extract all into a single string
                 target_data = dict()
-                with zipfile.ZipFile(StringIO.StringIO(self._source_data),
+                with zipfile.ZipFile(self._source_handle,
                                      mode='r',
                                      compression=zipfile.ZIP_DEFLATED) as zf:
                     for item in zf.infolist():
@@ -97,20 +110,121 @@ class ACLDownload(object):
-                        yield item.filename, zf.read(item)
+                        yield item.filename, zf.open(item)  # hand back a file object, like the other branches
                 self._target_data = target_data
             else:
-                yield os.path.basename(self._url), self._source_data
+                yield os.path.basename(self._url), self._source_handle
 
     def download(self):
         """ download / unpack ACL
-        :return: iterator filename, type, content
+        :return: iterator filename, line
         """
         self.fetch()
-        for filename, filedata in self.get_files():
-            for line in filedata.split('\n'):
-                if line.find('/') > -1:
-                    file_type = 'url'
-                else:
-                    file_type = 'domain'
-                yield filename, file_type, line
+        for filename, filehandle in self.get_files():
+            while True:
+                line = filehandle.readline()
+                if not line:
+                    break
+                yield filename, line
+
+
+class DomainSorter(object):
+    """ Helper class for building sorted squid domain acl list.
+        Use as file type object, close flushes the actual (sorted) data to disc
+    """
+    def __init__(self, filename=None, mode=None):
+        """ new sorted output file, uses an acl record in reverse order as sort key
+        :param filename: target filename
+        :param mode: file open mode
+        """
+        self._num_targets = 20
+        self._seperator = '|'
+        self._buckets = dict()
+        self._sort_map = dict()
+        # setup target
+        self._target_filename = filename
+        self._target_mode = mode
+        # setup temp files
+        self.generate_targets()
+
+    def generate_targets(self):
+        """ generate ordered targets
+        """
+        sets = 255
+        for i in range(sets):
+            target = chr(i+1)
+            setid = int(i / (sets / self._num_targets))
+            if setid not in self._buckets:
+                self._buckets[setid] = tempfile.NamedTemporaryFile()
+            self._sort_map[target] = self._buckets[setid]
+
+    def write(self, data):
+        """ save content, send reverse sorted to buffers
+        :param data: line to write
+        """
+        line = data.strip()
+        if len(line) > 0:
+            self.add(line[::-1], line)
+
+    def add(self, key, value):
+        """ spool data to temp
+        :param key: key to use
+        :param value: value to store
+        """
+        target = key[0]
+        if target in self._sort_map:
+            self._sort_map[target].write('%s%s%s\n'%(key, self._seperator, value))
+        else:
+            # not supposed to happen, every key should have a calculated target pool
+            pass
+
+    def reader(self):
+        """ read reverse
+        """
+        for target in sorted(self._buckets):
+            self._buckets[target].seek(0)
+            set_content = dict()
+            while True:
+                line = self._buckets[target].readline()
+                if not line:
+                    break
+                else:
+                    set_content[line.split('|')[0]] = '|'.join(line.split('|')[1:])
+            for itemkey in sorted(set_content, reverse=True):
+                yield set_content[itemkey]
+
+    @staticmethod
+    def is_domain(tag):
+        """ check if tag is probably a domain name
+        :param tag: tag to inspect
+        :return: boolean
+        """
+        has_chars = False
+        for tag_item in tag:
+            if not tag_item.isdigit() and tag_item not in ('.', ',', '|', '/', '\n'):
+                has_chars = True
+            elif tag_item in (':', '|', '/'):
+                return False
+        if has_chars:
+            return True
+        else:
+            return False
+
+    def close(self):
+        """ close and dump content
+        """
+        if self._target_filename is not None and self._target_mode is not None:
+            # flush to file on close
+            with open(self._target_filename, self._target_mode) as f_out:
+                prev_line = None
+                for line in self.reader():
+                    line = line.lstrip('.')
+                    if prev_line == line:
+                        # duplicate, skip
+                        continue
+                    if self.is_domain(line):
+                        # prefix domain, but only if the chances are very small it will overlap
+                        if prev_line is None or line not in prev_line:
+                            f_out.write('.')
+                    f_out.write(line)
+                    prev_line = line
 
 
 def filename_in_ignorelist(filename):
@@ -119,82 +233,90 @@ def filename_in_ignorelist(filename):
     """
     if (filename.lower().split('.')[-1] in ['pdf', 'txt', 'doc']):
         return True
-    elif (filename.lower() in ('readme', 'license')):
+    elif (filename.lower() in ('readme', 'license', 'usage', 'categories')):
        return True
    return False
 
 
-# parse OPNsense external ACLs config
-if os.path.exists(acl_config_fn):
-    # create acl directory (if new)
-    if not os.path.exists(acl_target_dir):
-        os.mkdir(acl_target_dir)
-    else:
-        # remove index files
-        for filename in glob.glob('%s/*.index'%acl_target_dir):
-            os.remove(filename)
-    # read config and download per section
-    cnf = ConfigParser()
-    cnf.read(acl_config_fn)
-    for section in cnf.sections():
-        target_filename = acl_target_dir+'/'+section
-        if cnf.has_option(section,'url'):
-            # collect filters to apply
-            acl_filters = list()
-            if cnf.has_option(section,'filter'):
-                for acl_filter in cnf.get(section,'filter').strip().split(','):
-                    if len(acl_filter.strip()) > 0:
-                        acl_filters.append(acl_filter)
-
-            # define targets
-            targets = {'domain': {'filename': target_filename, 'handle' : None},
-                       'url': {'filename': '%s.url'%target_filename, 'handle': None}}
-            # only generate files if enabled, otherwise dump empty files
-            if cnf.has_option(section,'enabled') and cnf.get(section,'enabled') == '1':
-                download_url = cnf.get(section,'url')
-                acl = ACLDownload(download_url, acl_max_timeout)
-                all_filenames = list()
-                for filename, filetype, line in acl.download():
-                    if filename_in_ignorelist(os.path.basename(filename)):
-                        # ignore documents, licenses and readme's
-                        continue
-                    if filename not in all_filenames:
-                        all_filenames.append(filename)
-
-                    if len(acl_filters) > 0:
-                        acl_found = False
-                        for acl_filter in acl_filters:
-                            if filename.find(acl_filter) > -1:
-                                acl_found = True
-                                break
-                        if not acl_found:
-                            # skip this acl entry
-                            continue
-                    if filetype in targets and targets[filetype]['handle'] is None:
-                        targets[filetype]['handle'] = open(targets[filetype]['filename'], 'wb')
-                    if filetype in targets:
-                        targets[filetype]['handle'].write('%s\n'%line)
-                # save index to disc
-                with open('%s.index'%target_filename,'wb') as idx_out:
-                    index_data = dict()
-                    for filename in all_filenames:
-                        if len(filename.split('/')) > 2:
-                            index_key = '/'.join(filename.split('/')[1:-1])
-                            if index_key not in index_data:
-                                index_data[index_key] = index_key
-                    idx_out.write(json.dumps(index_data))
-
-            # cleanup
-            for filetype in targets:
-                if targets[filetype]['handle'] is not None:
-                    targets[filetype]['handle'].close()
-                elif cnf.has_option(section,'enabled') and cnf.get(section,'enabled') != '1':
-                    if os.path.isfile(targets[filetype]['filename']):
-                        # disabled, remove previous data
-                        os.remove(targets[filetype]['filename'])
-                elif not os.path.isfile(targets[filetype]['filename']):
-                    # no data fetched and no file available, create new empty file
-                    with open(targets[filetype]['filename'], 'wb') as target_out:
-                        target_out.write("")
+def main():
+    # parse OPNsense external ACLs config
+    if os.path.exists(acl_config_fn):
+        # create acl directory (if new)
+        if not os.path.exists(acl_target_dir):
+            os.mkdir(acl_target_dir)
+        else:
+            # remove index files
+            for filename in glob.glob('%s/*.index'%acl_target_dir):
+                os.remove(filename)
+        # read config and download per section
+        cnf = ConfigParser()
+        cnf.read(acl_config_fn)
+        for section in cnf.sections():
+            target_filename = acl_target_dir+'/'+section
+            if cnf.has_option(section,'url'):
+                # collect filters to apply
+                acl_filters = list()
+                if cnf.has_option(section,'filter'):
+                    for acl_filter in cnf.get(section,'filter').strip().split(','):
+                        if len(acl_filter.strip()) > 0:
+                            acl_filters.append(acl_filter)
+
+                # define target(s)
+                targets = {'domain': {'filename': target_filename, 'handle' : None, 'class': DomainSorter}}
+
+                # only generate files if enabled, otherwise dump empty files
+                if cnf.has_option(section,'enabled') and cnf.get(section,'enabled') == '1':
+                    download_url = cnf.get(section,'url')
+                    acl = Downloader(download_url, acl_max_timeout)
+                    all_filenames = list()
+                    for filename, line in acl.download():
+                        if filename_in_ignorelist(os.path.basename(filename)):
+                            # ignore documents, licenses and readme's
+                            continue
+                        # detect output type
+                        if '/' in line or '|' in line:
+                            file_type = 'url'
+                        else:
+                            file_type = 'domain'
+
+                        if filename not in all_filenames:
+                            all_filenames.append(filename)
+
+                        if len(acl_filters) > 0:
+                            acl_found = False
+                            for acl_filter in acl_filters:
+                                if acl_filter in filename:
+                                    acl_found = True
+                                    break
+                            if not acl_found:
+                                # skip this acl entry
+                                continue
+
+                        if file_type in targets and targets[file_type]['handle'] is None:
+                            targets[file_type]['handle'] = targets[file_type]['class'](targets[file_type]['filename'], 'wb')
+                        if file_type in targets:
+                            targets[file_type]['handle'].write('%s\n'%line)
+                    # save index to disc
+                    with open('%s.index'%target_filename,'wb') as idx_out:
+                        index_data = dict()
+                        for filename in all_filenames:
+                            if len(filename.split('/')) > 2:
+                                index_key = '/'.join(filename.split('/')[1:-1])
+                                if index_key not in index_data:
+                                    index_data[index_key] = index_key
+                        idx_out.write(json.dumps(index_data))
+
+                # cleanup
+                for filetype in targets:
+                    if targets[filetype]['handle'] is not None:
+                        targets[filetype]['handle'].close()
+                    elif cnf.has_option(section,'enabled') and cnf.get(section,'enabled') != '1':
+                        if os.path.isfile(targets[filetype]['filename']):
+                            # disabled, remove previous data
+                            os.remove(targets[filetype]['filename'])
+                    elif not os.path.isfile(targets[filetype]['filename']):
+                        # no data fetched and no file available, create new empty file
+                        with open(targets[filetype]['filename'], 'wb') as target_out:
+                            target_out.write("")
+
+
+# execute downloader
+main()
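
A minimal sketch of how the two classes introduced here fit together, bypassing the config handling in main(); the URL and output path are placeholders, not values from this commit:

acl = Downloader('https://example.org/domains.tar.gz', acl_max_timeout)  # placeholder source URL
writer = DomainSorter('/usr/local/etc/squid/acl/example_acl', 'wb')      # hypothetical target file
for filename, line in acl.download():
    if '/' not in line and '|' not in line:  # keep domain entries only, mirroring main()
        writer.write(line)
writer.close()  # flushes the sorted, de-duplicated domain list to disk

DomainSorter spreads records over 20 temporary files keyed on the first byte of the reversed entry, so each bucket can be sorted in memory separately; because the sort key is the reversed domain, subdomains end up next to their parent domain. close() then skips exact duplicates and writes each domain with a leading dot, the dotted form squid's dstdomain ACLs use to match subdomains, omitting the dot when the entry overlaps the preceding one.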