Commit d1701ff7 authored by Ad Schellevis, committed by Franco Fichtner

(proxy) improve remote acl handling, sort and structure output for squid.

(cherry picked from commit a03cc14f)
parent 4393cdf8
#!/usr/local/bin/python2.7
"""
    Copyright (c) 2016 Ad Schellevis - Deciso B.V.
    Copyright (c) 2015 Jos Schellevis - Deciso B.V.
    All rights reserved.
@@ -26,6 +27,7 @@
    POSSIBILITY OF SUCH DAMAGE.
"""
import tempfile
import urllib2
import os
import json
@@ -42,54 +44,65 @@
acl_config_fn = ('/usr/local/etc/squid/externalACLs.conf')
acl_target_dir = ('/usr/local/etc/squid/acl')
acl_max_timeout = 30


class Downloader(object):
    """ Download helper
    """

    def __init__(self, url, timeout):
        """ init new
        :param url: source url
        :param timeout: timeout in seconds
        """
        self._url = url
        self._timeout = timeout
        self._source_handle = None
        self._target_data = None

    def fetch(self):
        """ fetch (raw) source data into tempfile using self._source_handle
        """
        try:
            f = urllib2.urlopen(self._url, timeout=self._timeout)
            # flush to temp file
            self._source_handle = tempfile.NamedTemporaryFile()
            while True:
                data = f.read(1024)
                if not data:
                    break
                else:
                    self._source_handle.write(data)
            self._source_handle.seek(0)
            f.close()
        except (urllib2.URLError, urllib2.HTTPError, IOError) as e:
            syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s' % self._url)
            self._source_handle = None

    def get_files(self):
        """ process downloaded data, handle compression
        :return: iterator filename, file handle
        """
        if self._source_handle is not None:
            # handle compressed data
            if (len(self._url) > 8 and self._url[-7:] == '.tar.gz') \
                    or (len(self._url) > 4 and self._url[-4:] == '.tgz'):
                # source is in tar.gz format, extract all into a single string
                try:
                    tf = tarfile.open(fileobj=self._source_handle)
                    for tf_file in tf.getmembers():
                        if tf_file.isfile():
                            yield tf_file.name, tf.extractfile(tf_file)
                except IOError as e:
                    syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)' % (self._url, e))
            elif len(self._url) > 4 and self._url[-3:] == '.gz':
                # source is in .gz format unpack
                try:
                    gf = gzip.GzipFile(mode='r', fileobj=self._source_handle)
                    yield os.path.basename(self._url), gf
                except IOError as e:
                    syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)' % (self._url, e))
            elif len(self._url) > 5 and self._url[-4:] == '.zip':
                # source is in .zip format, extract all into a single string
                target_data = dict()
                with zipfile.ZipFile(self._source_handle,
                                     mode='r',
                                     compression=zipfile.ZIP_DEFLATED) as zf:
                    for item in zf.infolist():
@@ -97,20 +110,121 @@
                        yield item.filename, zf.read(item)
                self._target_data = target_data
            else:
                yield os.path.basename(self._url), self._source_handle

    def download(self):
        """ download / unpack ACL
        :return: iterator filename, line
        """
        self.fetch()
        for filename, filehandle in self.get_files():
            while True:
                line = filehandle.readline()
                if not line:
                    break
                yield filename, line
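
For orientation, a minimal sketch of how the class above is meant to be driven; the URL below is a placeholder and not part of the commit. fetch() spools the HTTP response into a NamedTemporaryFile, get_files() yields file handles (transparently unwrapping .tar.gz/.tgz, .gz and .zip sources), and download() flattens those handles into (filename, line) pairs:

    # illustrative sketch only, the URL is a placeholder
    acl = Downloader('http://example.com/blacklist.tar.gz', acl_max_timeout)
    for filename, line in acl.download():
        print filename, line.strip()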


class DomainSorter(object):
    """ Helper class for building sorted squid domain acl list.
        Use as file type object, close flushes the actual (sorted) data to disc
    """

    def __init__(self, filename=None, mode=None):
        """ new sorted output file, uses an acl record in reverse order as sort key
        :param filename: target filename
        :param mode: file open mode
        """
        self._num_targets = 20
        self._seperator = '|'
        self._buckets = dict()
        self._sort_map = dict()
        # setup target
        self._target_filename = filename
        self._target_mode = mode
        # setup temp files
        self.generate_targets()

    def generate_targets(self):
        """ generate ordered targets
        """
        sets = 255
        for i in range(sets):
            target = chr(i + 1)
            setid = int(i / (sets / self._num_targets))
            if setid not in self._buckets:
                self._buckets[setid] = tempfile.NamedTemporaryFile()
            self._sort_map[target] = self._buckets[setid]

    def write(self, data):
        """ save content, send reverse sorted to buffers
        :param data: line to write
        """
        line = data.strip()
        if len(line) > 0:
            self.add(line[::-1], line)

    def add(self, key, value):
        """ spool data to temp
        :param key: key to use
        :param value: value to store
        """
        target = key[0]
        if target in self._sort_map:
            self._sort_map[target].write('%s%s%s\n' % (key, self._seperator, value))
        else:
            # not supposed to happen, every key should have a calculated target pool
            pass

    def reader(self):
        """ read reverse
        """
        for target in sorted(self._buckets):
            self._buckets[target].seek(0)
            set_content = dict()
            while True:
                line = self._buckets[target].readline()
                if not line:
                    break
                else:
                    set_content[line.split('|')[0]] = '|'.join(line.split('|')[1:])
            for itemkey in sorted(set_content, reverse=True):
                yield set_content[itemkey]

    @staticmethod
    def is_domain(tag):
        """ check if tag is probably a domain name
        :param tag: tag to inspect
        :return: boolean
        """
        has_chars = False
        for tag_item in tag:
            if not tag_item.isdigit() and tag_item not in ('.', ',', '|', '/', '\n'):
                has_chars = True
            elif tag_item in (':', '|', '/'):
                return False
        if has_chars:
            return True
        else:
            return False

    def close(self):
        """ close and dump content
        """
        if self._target_filename is not None and self._target_mode is not None:
            # flush to file on close
            with open(self._target_filename, self._target_mode) as f_out:
                prev_line = None
                for line in self.reader():
                    line = line.lstrip('.')
                    if prev_line == line:
                        # duplicate, skip
                        continue
                    if self.is_domain(line):
                        # prefix domain, but only if the chances are very small it will overlap
                        if prev_line is None or line not in prev_line:
                            f_out.write('.')
                    f_out.write(line)
                    prev_line = line
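
DomainSorter behaves like a writable file: each line is keyed by its reversed text, spooled into one of a fixed set of temporary-file buckets based on the key's first character, and on close() the buckets are read back with their keys reverse-sorted so entries for the same domain end up adjacent; duplicates are skipped and domain entries are dot-prefixed unless they overlap the previous entry. A minimal sketch, with a placeholder output path:

    # illustrative sketch only, '/tmp/example.acl' is a placeholder path
    sorter = DomainSorter('/tmp/example.acl', 'wb')
    for entry in ('www.example.com\n', 'example.com\n', 'ads.example.net\n'):
        sorter.write(entry)
    sorter.close()  # flushes the sorted, de-duplicated list to /tmp/example.acl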


def filename_in_ignorelist(filename):
@@ -119,82 +233,90 @@
    """
    if (filename.lower().split('.')[-1] in ['pdf', 'txt', 'doc']):
        return True
    elif (filename.lower() in ('readme', 'license', 'usage', 'categories')):
        return True
    return False


def main():
    # parse OPNsense external ACLs config
    if os.path.exists(acl_config_fn):
        # create acl directory (if new)
        if not os.path.exists(acl_target_dir):
            os.mkdir(acl_target_dir)
        else:
            # remove index files
            for filename in glob.glob('%s/*.index' % acl_target_dir):
                os.remove(filename)
        # read config and download per section
        cnf = ConfigParser()
        cnf.read(acl_config_fn)
        for section in cnf.sections():
            target_filename = acl_target_dir + '/' + section
            if cnf.has_option(section, 'url'):
                # collect filters to apply
                acl_filters = list()
                if cnf.has_option(section, 'filter'):
                    for acl_filter in cnf.get(section, 'filter').strip().split(','):
                        if len(acl_filter.strip()) > 0:
                            acl_filters.append(acl_filter)

                # define target(s)
                targets = {'domain': {'filename': target_filename, 'handle': None, 'class': DomainSorter}}

                # only generate files if enabled, otherwise dump empty files
                if cnf.has_option(section, 'enabled') and cnf.get(section, 'enabled') == '1':
                    download_url = cnf.get(section, 'url')
                    acl = Downloader(download_url, acl_max_timeout)
                    all_filenames = list()
                    for filename, line in acl.download():
                        if filename_in_ignorelist(os.path.basename(filename)):
                            # ignore documents, licenses and readme's
                            continue
                        # detect output type
                        if '/' in line or '|' in line:
                            file_type = 'url'
                        else:
                            file_type = 'domain'

                        if filename not in all_filenames:
                            all_filenames.append(filename)

                        if len(acl_filters) > 0:
                            acl_found = False
                            for acl_filter in acl_filters:
                                if acl_filter in filename:
                                    acl_found = True
                                    break
                            if not acl_found:
                                # skip this acl entry
                                continue

                        if file_type in targets and targets[file_type]['handle'] is None:
                            targets[file_type]['handle'] = targets[file_type]['class'](targets[file_type]['filename'], 'wb')
                        if file_type in targets:
                            targets[file_type]['handle'].write('%s\n' % line)
                    # save index to disc
                    with open('%s.index' % target_filename, 'wb') as idx_out:
                        index_data = dict()
                        for filename in all_filenames:
                            if len(filename.split('/')) > 2:
                                index_key = '/'.join(filename.split('/')[1:-1])
                                if index_key not in index_data:
                                    index_data[index_key] = index_key
                        idx_out.write(json.dumps(index_data))
                # cleanup
                for filetype in targets:
                    if targets[filetype]['handle'] is not None:
                        targets[filetype]['handle'].close()
                    elif cnf.has_option(section, 'enabled') and cnf.get(section, 'enabled') != '1':
                        if os.path.isfile(targets[filetype]['filename']):
                            # disabled, remove previous data
                            os.remove(targets[filetype]['filename'])
                    elif not os.path.isfile(targets[filetype]['filename']):
                        # no data fetched and no file available, create new empty file
                        with open(targets[filetype]['filename'], 'wb') as target_out:
                            target_out.write("")


# execute downloader
main()
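
The configuration consumed by main() is plain ConfigParser ini: each section becomes an output file under /usr/local/etc/squid/acl, and url, filter and enabled are the options read above. A hypothetical /usr/local/etc/squid/externalACLs.conf entry could look like this (the section name, URL and filter terms are placeholders):

    [bannedsites]
    url=http://example.com/blacklist.tar.gz
    ; comma separated terms, only archive members whose path contains one are kept
    filter=adv,porn
    enabled=1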