Commit a03cc14f authored by Ad Schellevis

(proxy) improve remote acl handling, sort and structure output for squid.

parent 78c3d912
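The central addition is a DomainSorter helper that spools downloaded blacklist entries into temporary buckets keyed on the reversed domain name, so hosts sharing a suffix sort next to each other and duplicates can be dropped when the list is flushed for squid. A minimal standalone sketch of that reverse-string sort key (example domains are hypothetical; the commit additionally buckets the data and sorts each bucket in reverse order):

domains = ['ads.example.com', 'example.com', 'cdn.example.org', 'www.example.com']
# sorting on the reversed string groups entries by common suffix,
# mirroring the key DomainSorter derives in write() / add()
for name in sorted(domains, key=lambda d: d[::-1]):
    print(name)
# -> cdn.example.org, example.com, ads.example.com, www.example.com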
#!/usr/local/bin/python2.7
"""
Copyright (c) 2016 Ad Schellevis - Deciso B.V.
Copyright (c) 2015 Jos Schellevis - Deciso B.V.
All rights reserved.
@@ -26,6 +27,7 @@
POSSIBILITY OF SUCH DAMAGE.
"""
import tempfile
import urllib2
import os
import json
@@ -42,54 +44,65 @@ acl_config_fn = ('/usr/local/etc/squid/externalACLs.conf')
acl_target_dir = ('/usr/local/etc/squid/acl')
acl_max_timeout = 30
class ACLDownload(object):
class Downloader(object):
""" Download helper
"""
def __init__(self, url, timeout):
""" init new
:param url: source url
:param timeout: timeout in seconds
"""
self._url = url
self._timeout = timeout
self._source_data = None
self._source_handle = None
self._target_data = None
def fetch(self):
""" fetch (raw) source data into self._source_data
""" fetch (raw) source data into tempfile using self._source_handle
"""
try:
f = urllib2.urlopen(self._url,timeout = self._timeout)
self._source_data = f.read()
f = urllib2.urlopen(self._url, timeout = self._timeout)
# flush to temp file
self._source_handle = tempfile.NamedTemporaryFile()
while True:
data = f.read(1024)
if not data:
break
else:
self._source_handle.write(data)
self._source_handle.seek(0)
f.close()
except (urllib2.URLError, urllib2.HTTPError, IOError) as e:
syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s'%self._url)
self._source_data = None
self._source_handle = None
def get_files(self):
""" process downloaded data, handle compression
:return: iterator filename, content
:return: iterator filename, file handle
"""
if self._source_data is not None:
if self._source_handle is not None:
# handle compressed data
if (len(self._url) > 8 and self._url[-7:] == '.tar.gz') \
or (len(self._url) > 4 and self._url[-4:] == '.tgz'):
# source is in tar.gz format, extract all into a single string
try:
tf = tarfile.open(fileobj=StringIO.StringIO(self._source_data))
tf = tarfile.open(fileobj=self._source_handle)
for tf_file in tf.getmembers():
if tf_file.isfile():
yield tf_file.name, tf.extractfile(tf_file).read()
yield tf_file.name, tf.extractfile(tf_file)
except IOError as e:
syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)'%(self._url, e))
elif len(self._url) > 4 and self._url[-3:] == '.gz':
# source is in .gz format unpack
try:
gf = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(self._source_data))
yield os.path.basename(self._url), gf.read()
gf = gzip.GzipFile(mode='r', fileobj=self._source_handle)
yield os.path.basename(self._url), gf
except IOError as e:
syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)'%(self._url, e))
elif len(self._url) > 5 and self._url[-4:] == '.zip':
# source is in .zip format, extract all into a single string
target_data = dict()
with zipfile.ZipFile(StringIO.StringIO(self._source_data),
with zipfile.ZipFile(self._source_handle,
mode='r',
compression=zipfile.ZIP_DEFLATED) as zf:
for item in zf.infolist():
@@ -97,20 +110,121 @@ class ACLDownload(object):
yield item.filename, zf.read(item)
self._target_data = target_data
else:
yield os.path.basename(self._url), self._source_data
yield os.path.basename(self._url), self._source_handle
def download(self):
""" download / unpack ACL
:return: iterator filename, type, content
"""
self.fetch()
for filename, filedata in self.get_files():
for line in filedata.split('\n'):
if line.find('/') > -1:
file_type = 'url'
for filename, filehandle in self.get_files():
while True:
line = filehandle.readline()
if not line:
break
yield filename, line
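# Illustrative usage sketch (URL is hypothetical): Downloader streams the remote list into a
# temporary file and yields it back line by line, unpacking .tar.gz/.tgz/.gz/.zip sources first.
#   acl = Downloader('https://example.com/blacklist.tar.gz', 30)
#   for filename, line in acl.download():
#       pass  # one entry of every file contained in the download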
class DomainSorter(object):
""" Helper class for building sorted squid domain acl list.
Use as file type object, close flushes the actual (sorted) data to disc
"""
def __init__(self, filename=None, mode=None):
""" new sorted output file, uses an acl record in reverse order as sort key
:param filename: target filename
:param mode: file open mode
"""
self._num_targets = 20
self._seperator = '|'
self._buckets = dict()
self._sort_map = dict()
# setup target
self._target_filename = filename
self._target_mode = mode
# setup temp files
self.generate_targets()
def generate_targets(self):
""" generate ordered targets
"""
sets = 255
for i in range(sets):
target = chr(i+1)
setid = int(i / (sets / self._num_targets))
if setid not in self._buckets:
self._buckets[setid] = tempfile.NamedTemporaryFile()
self._sort_map[target] = self._buckets[setid]
def write(self, data):
""" save content, send reverse sorted to buffers
:param data: line to write
"""
line = data.strip()
if len(line) > 0:
self.add(line[::-1], line)
def add(self, key, value):
""" spool data to temp
:param key: key to use
:param value: value to store
"""
target = key[0]
if target in self._sort_map:
self._sort_map[target].write('%s%s%s\n'%(key, self._seperator, value))
else:
# not supposed to happen, every key should have a calculated target pool
pass
def reader(self):
""" read reverse
"""
for target in sorted(self._buckets):
self._buckets[target].seek(0)
set_content = dict()
while True:
line = self._buckets[target].readline()
if not line:
break
else:
file_type = 'domain'
yield filename, file_type, line
set_content[line.split('|')[0]] = '|'.join(line.split('|')[1:])
for itemkey in sorted(set_content, reverse=True):
yield set_content[itemkey]
@staticmethod
def is_domain(tag):
""" check if tag is probably a domain name
:param tag: tag to inspect
:return: boolean
"""
has_chars = False
for tag_item in tag:
if not tag_item.isdigit() and tag_item not in ('.', ',', '|', '/', '\n'):
has_chars = True
elif tag_item in (':', '|', '/'):
return False
if has_chars:
return True
else:
return False
def close(self):
""" close and dump content
"""
if self._target_filename is not None and self._target_mode is not None:
# flush to file on close
with open(self._target_filename, self._target_mode) as f_out:
prev_line = None
for line in self.reader():
line = line.lstrip('.')
if prev_line == line:
# duplicate, skip
continue
if self.is_domain(line):
# prefix domain, but only if the chances are very small it will overlap
if prev_line is None or line not in prev_line:
f_out.write('.')
f_out.write(line)
prev_line = line
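# Illustrative usage sketch (paths and domains are hypothetical): DomainSorter acts as a
# writable file object; write() spools each line keyed on the reversed domain, and close()
# flushes the suffix-sorted, de-duplicated result to the target file.
#   sorter = DomainSorter('/tmp/example.acl', 'wb')
#   sorter.write('ads.example.com\n')
#   sorter.write('www.example.com\n')
#   sorter.close()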
def filename_in_ignorelist(filename):
@@ -119,82 +233,90 @@ def filename_in_ignorelist(filename):
"""
if (filename.lower().split('.')[-1] in ['pdf', 'txt', 'doc']):
return True
elif (filename.lower() in ('readme', 'license')):
elif (filename.lower() in ('readme', 'license', 'usage', 'categories')):
return True
return False
# parse OPNsense external ACLs config
if os.path.exists(acl_config_fn):
# create acl directory (if new)
if not os.path.exists(acl_target_dir):
os.mkdir(acl_target_dir)
else:
# remove index files
for filename in glob.glob('%s/*.index'%acl_target_dir):
os.remove(filename)
# read config and download per section
cnf = ConfigParser()
cnf.read(acl_config_fn)
for section in cnf.sections():
target_filename = acl_target_dir+'/'+section
if cnf.has_option(section,'url'):
# collect filters to apply
acl_filters = list()
if cnf.has_option(section,'filter'):
for acl_filter in cnf.get(section,'filter').strip().split(','):
if len(acl_filter.strip()) > 0:
acl_filters.append(acl_filter)
# define targets
targets = {'domain': {'filename': target_filename, 'handle' : None},
'url': {'filename': '%s.url'%target_filename, 'handle': None}}
# only generate files if enabled, otherwise dump empty files
if cnf.has_option(section,'enabled') and cnf.get(section,'enabled') == '1':
download_url = cnf.get(section,'url')
acl = ACLDownload(download_url, acl_max_timeout)
all_filenames = list()
for filename, filetype, line in acl.download():
if filename_in_ignorelist(os.path.basename(filename)):
# ignore documents, licenses and readme's
continue
def main():
# parse OPNsense external ACLs config
if os.path.exists(acl_config_fn):
# create acl directory (if new)
if not os.path.exists(acl_target_dir):
os.mkdir(acl_target_dir)
else:
# remove index files
for filename in glob.glob('%s/*.index'%acl_target_dir):
os.remove(filename)
# read config and download per section
cnf = ConfigParser()
cnf.read(acl_config_fn)
for section in cnf.sections():
target_filename = acl_target_dir+'/'+section
if cnf.has_option(section,'url'):
# collect filters to apply
acl_filters = list()
if cnf.has_option(section,'filter'):
for acl_filter in cnf.get(section,'filter').strip().split(','):
if len(acl_filter.strip()) > 0:
acl_filters.append(acl_filter)
# define target(s)
targets = {'domain': {'filename': target_filename, 'handle' : None, 'class': DomainSorter}}
if filename not in all_filenames:
all_filenames.append(filename)
if len(acl_filters) > 0:
acl_found = False
for acl_filter in acl_filters:
if filename.find(acl_filter) > -1:
acl_found = True
break
if not acl_found:
# skip this acl entry
# only generate files if enabled, otherwise dump empty files
if cnf.has_option(section,'enabled') and cnf.get(section,'enabled') == '1':
download_url = cnf.get(section,'url')
acl = Downloader(download_url, acl_max_timeout)
all_filenames = list()
for filename, line in acl.download():
if filename_in_ignorelist(os.path.basename(filename)):
# ignore documents, licenses and readme's
continue
if filetype in targets and targets[filetype]['handle'] is None:
targets[filetype]['handle'] = open(targets[filetype]['filename'], 'wb')
if filetype in targets:
targets[filetype]['handle'].write('%s\n'%line)
# save index to disc
with open('%s.index'%target_filename,'wb') as idx_out:
index_data = dict()
for filename in all_filenames:
if len(filename.split('/')) > 2:
index_key = '/'.join(filename.split('/')[1:-1])
if index_key not in index_data:
index_data[index_key] = index_key
idx_out.write(json.dumps(index_data))
# cleanup
for filetype in targets:
if targets[filetype]['handle'] is not None:
targets[filetype]['handle'].close()
elif cnf.has_option(section,'enabled') and cnf.get(section,'enabled') != '1':
if os.path.isfile(targets[filetype]['filename']):
# disabled, remove previous data
os.remove(targets[filetype]['filename'])
elif not os.path.isfile(targets[filetype]['filename']):
# no data fetched and no file available, create new empty file
with open(targets[filetype]['filename'], 'wb') as target_out:
target_out.write("")
# detect output type
if '/' in line or '|' in line:
file_type = 'url'
else:
file_type = 'domain'
if filename not in all_filenames:
all_filenames.append(filename)
if len(acl_filters) > 0:
acl_found = False
for acl_filter in acl_filters:
if acl_filter in filename:
acl_found = True
break
if not acl_found:
# skip this acl entry
continue
if filetype in targets and targets[filetype]['handle'] is None:
targets[filetype]['handle'] = targets[filetype]['class'](targets[filetype]['filename'], 'wb')
if filetype in targets:
targets[filetype]['handle'].write('%s\n'%line)
# save index to disc
with open('%s.index'%target_filename,'wb') as idx_out:
index_data = dict()
for filename in all_filenames:
if len(filename.split('/')) > 2:
index_key = '/'.join(filename.split('/')[1:-1])
if index_key not in index_data:
index_data[index_key] = index_key
idx_out.write(json.dumps(index_data))
# cleanup
for filetype in targets:
if targets[filetype]['handle'] is not None:
targets[filetype]['handle'].close()
elif cnf.has_option(section,'enabled') and cnf.get(section,'enabled') != '1':
if os.path.isfile(targets[filetype]['filename']):
# disabled, remove previous data
os.remove(targets[filetype]['filename'])
elif not os.path.isfile(targets[filetype]['filename']):
# no data fetched and no file available, create new empty file
with open(targets[filetype]['filename'], 'wb') as target_out:
target_out.write("")
# execute downloader
main()
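The per-section files written to /usr/local/etc/squid/acl are plain domain lists; entries recognised as domains get a leading dot so squid's dstdomain matching also covers subdomains. A hypothetical excerpt of such an output file, and a squid.conf rule that could consume it (section and acl names are illustrative):

# hypothetical contents of /usr/local/etc/squid/acl/<section>
.example.com
.tracker.example.net

# illustrative squid.conf usage
acl remote_blacklist dstdomain "/usr/local/etc/squid/acl/<section>"
http_access deny remote_blacklist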