Commit d1701ff7 authored by Ad Schellevis, committed by Franco Fichtner

(proxy) improve remote acl handling, sort and structure output for squid.

(cherry picked from commit a03cc14f)
parent 4393cdf8
#!/usr/local/bin/python2.7
"""
    Copyright (c) 2016 Ad Schellevis - Deciso B.V.
    Copyright (c) 2015 Jos Schellevis - Deciso B.V.
    All rights reserved.
@@ -26,6 +27,7 @@
    POSSIBILITY OF SUCH DAMAGE.
"""
import tempfile
import urllib2
import os
import json
@@ -42,54 +44,65 @@
acl_config_fn = ('/usr/local/etc/squid/externalACLs.conf')
acl_target_dir = ('/usr/local/etc/squid/acl')
acl_max_timeout = 30


class Downloader(object):
    """ Download helper
    """

    def __init__(self, url, timeout):
        """ init new
        :param url: source url
        :param timeout: timeout in seconds
        """
        self._url = url
        self._timeout = timeout
        self._source_handle = None
        self._target_data = None

    def fetch(self):
        """ fetch (raw) source data into tempfile using self._source_handle
        """
        try:
            f = urllib2.urlopen(self._url, timeout=self._timeout)
            # flush to temp file
            self._source_handle = tempfile.NamedTemporaryFile()
            while True:
                data = f.read(1024)
                if not data:
                    break
                else:
                    self._source_handle.write(data)
            self._source_handle.seek(0)
            f.close()
        except (urllib2.URLError, urllib2.HTTPError, IOError) as e:
            syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s' % self._url)
            self._source_handle = None

    def get_files(self):
        """ process downloaded data, handle compression
        :return: iterator filename, file handle
        """
        if self._source_handle is not None:
            # handle compressed data
            if (len(self._url) > 8 and self._url[-7:] == '.tar.gz') \
                    or (len(self._url) > 4 and self._url[-4:] == '.tgz'):
                # source is in tar.gz format, extract all into a single string
                try:
                    tf = tarfile.open(fileobj=self._source_handle)
                    for tf_file in tf.getmembers():
                        if tf_file.isfile():
                            yield tf_file.name, tf.extractfile(tf_file)
                except IOError as e:
                    syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)' % (self._url, e))
            elif len(self._url) > 4 and self._url[-3:] == '.gz':
                # source is in .gz format unpack
                try:
                    gf = gzip.GzipFile(mode='r', fileobj=self._source_handle)
                    yield os.path.basename(self._url), gf
                except IOError as e:
                    syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)' % (self._url, e))
            elif len(self._url) > 5 and self._url[-4:] == '.zip':
                # source is in .zip format, extract all into a single string
                target_data = dict()
                with zipfile.ZipFile(self._source_handle,
                                     mode='r',
                                     compression=zipfile.ZIP_DEFLATED) as zf:
                    for item in zf.infolist():
@@ -97,20 +110,121 @@
                        yield item.filename, zf.read(item)
                self._target_data = target_data
            else:
                yield os.path.basename(self._url), self._source_handle

    def download(self):
        """ download / unpack ACL
        :return: iterator filename, line
        """
        self.fetch()
        for filename, filehandle in self.get_files():
            while True:
                line = filehandle.readline()
                if not line:
                    break
                yield filename, line
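
For orientation, a minimal sketch of how the class above is meant to be driven; the URL below is a placeholder and not part of the commit. fetch() spools the HTTP response into a NamedTemporaryFile, get_files() yields file handles (transparently unwrapping .tar.gz/.tgz, .gz and .zip sources), and download() flattens those handles into (filename, line) pairs:

    # illustrative sketch only, the URL is a placeholder
    acl = Downloader('http://example.com/blacklist.tar.gz', acl_max_timeout)
    for filename, line in acl.download():
        print filename, line.strip()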


class DomainSorter(object):
    """ Helper class for building sorted squid domain acl list.
        Use as file type object, close flushes the actual (sorted) data to disc
    """

    def __init__(self, filename=None, mode=None):
        """ new sorted output file, uses an acl record in reverse order as sort key
        :param filename: target filename
        :param mode: file open mode
        """
        self._num_targets = 20
        self._seperator = '|'
        self._buckets = dict()
        self._sort_map = dict()
        # setup target
        self._target_filename = filename
        self._target_mode = mode
        # setup temp files
        self.generate_targets()

    def generate_targets(self):
        """ generate ordered targets
        """
        sets = 255
        for i in range(sets):
            target = chr(i + 1)
            setid = int(i / (sets / self._num_targets))
            if setid not in self._buckets:
                self._buckets[setid] = tempfile.NamedTemporaryFile()
            self._sort_map[target] = self._buckets[setid]

    def write(self, data):
        """ save content, send reverse sorted to buffers
        :param data: line to write
        """
        line = data.strip()
        if len(line) > 0:
            self.add(line[::-1], line)

    def add(self, key, value):
        """ spool data to temp
        :param key: key to use
        :param value: value to store
        """
        target = key[0]
        if target in self._sort_map:
            self._sort_map[target].write('%s%s%s\n' % (key, self._seperator, value))
        else:
            # not supposed to happen, every key should have a calculated target pool
            pass

    def reader(self):
        """ read reverse
        """
        for target in sorted(self._buckets):
            self._buckets[target].seek(0)
            set_content = dict()
            while True:
                line = self._buckets[target].readline()
                if not line:
                    break
                else:
                    set_content[line.split('|')[0]] = '|'.join(line.split('|')[1:])
            for itemkey in sorted(set_content, reverse=True):
                yield set_content[itemkey]

    @staticmethod
    def is_domain(tag):
        """ check if tag is probably a domain name
        :param tag: tag to inspect
        :return: boolean
        """
        has_chars = False
        for tag_item in tag:
            if not tag_item.isdigit() and tag_item not in ('.', ',', '|', '/', '\n'):
                has_chars = True
            elif tag_item in (':', '|', '/'):
                return False
        if has_chars:
            return True
        else:
            return False

    def close(self):
        """ close and dump content
        """
        if self._target_filename is not None and self._target_mode is not None:
            # flush to file on close
            with open(self._target_filename, self._target_mode) as f_out:
                prev_line = None
                for line in self.reader():
                    line = line.lstrip('.')
                    if prev_line == line:
                        # duplicate, skip
                        continue
                    if self.is_domain(line):
                        # prefix domain, but only if the chances are very small it will overlap
                        if prev_line is None or line not in prev_line:
                            f_out.write('.')
                    f_out.write(line)
                    prev_line = line
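
DomainSorter behaves like a writable file: each line is keyed by its reversed text, spooled into one of a fixed set of temporary-file buckets based on the key's first character, and on close() the buckets are read back with their keys reverse-sorted so entries for the same domain end up adjacent; duplicates are skipped and domain entries are dot-prefixed unless they overlap the previous entry. A minimal sketch, with a placeholder output path:

    # illustrative sketch only, '/tmp/example.acl' is a placeholder path
    sorter = DomainSorter('/tmp/example.acl', 'wb')
    for entry in ('www.example.com\n', 'example.com\n', 'ads.example.net\n'):
        sorter.write(entry)
    sorter.close()  # flushes the sorted, de-duplicated list to /tmp/example.acl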


def filename_in_ignorelist(filename):
@@ -119,82 +233,90 @@
    """
    if (filename.lower().split('.')[-1] in ['pdf', 'txt', 'doc']):
        return True
    elif (filename.lower() in ('readme', 'license', 'usage', 'categories')):
        return True
    return False


def main():
    # parse OPNsense external ACLs config
    if os.path.exists(acl_config_fn):
        # create acl directory (if new)
        if not os.path.exists(acl_target_dir):
            os.mkdir(acl_target_dir)
        else:
            # remove index files
            for filename in glob.glob('%s/*.index' % acl_target_dir):
                os.remove(filename)
        # read config and download per section
        cnf = ConfigParser()
        cnf.read(acl_config_fn)
        for section in cnf.sections():
            target_filename = acl_target_dir + '/' + section
            if cnf.has_option(section, 'url'):
                # collect filters to apply
                acl_filters = list()
                if cnf.has_option(section, 'filter'):
                    for acl_filter in cnf.get(section, 'filter').strip().split(','):
                        if len(acl_filter.strip()) > 0:
                            acl_filters.append(acl_filter)

                # define target(s)
                targets = {'domain': {'filename': target_filename, 'handle': None, 'class': DomainSorter}}

                # only generate files if enabled, otherwise dump empty files
                if cnf.has_option(section, 'enabled') and cnf.get(section, 'enabled') == '1':
                    download_url = cnf.get(section, 'url')
                    acl = Downloader(download_url, acl_max_timeout)
                    all_filenames = list()
                    for filename, line in acl.download():
                        if filename_in_ignorelist(os.path.basename(filename)):
                            # ignore documents, licenses and readme's
                            continue
                        # detect output type
                        if '/' in line or '|' in line:
                            file_type = 'url'
                        else:
                            file_type = 'domain'

                        if filename not in all_filenames:
                            all_filenames.append(filename)

                        if len(acl_filters) > 0:
                            acl_found = False
                            for acl_filter in acl_filters:
                                if acl_filter in filename:
                                    acl_found = True
                                    break
                            if not acl_found:
                                # skip this acl entry
                                continue

                        if file_type in targets and targets[file_type]['handle'] is None:
                            targets[file_type]['handle'] = targets[file_type]['class'](targets[file_type]['filename'], 'wb')
                        if file_type in targets:
                            targets[file_type]['handle'].write('%s\n' % line)
                    # save index to disc
                    with open('%s.index' % target_filename, 'wb') as idx_out:
                        index_data = dict()
                        for filename in all_filenames:
                            if len(filename.split('/')) > 2:
                                index_key = '/'.join(filename.split('/')[1:-1])
                                if index_key not in index_data:
                                    index_data[index_key] = index_key
                        idx_out.write(json.dumps(index_data))
                # cleanup
                for filetype in targets:
                    if targets[filetype]['handle'] is not None:
                        targets[filetype]['handle'].close()
                    elif cnf.has_option(section, 'enabled') and cnf.get(section, 'enabled') != '1':
                        if os.path.isfile(targets[filetype]['filename']):
                            # disabled, remove previous data
                            os.remove(targets[filetype]['filename'])
                    elif not os.path.isfile(targets[filetype]['filename']):
                        # no data fetched and no file available, create new empty file
                        with open(targets[filetype]['filename'], 'wb') as target_out:
                            target_out.write("")


# execute downloader
main()
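
The configuration consumed by main() is plain ConfigParser ini: each section becomes an output file under /usr/local/etc/squid/acl, and url, filter and enabled are the options read above. A hypothetical /usr/local/etc/squid/externalACLs.conf entry could look like this (the section name, URL and filter terms are placeholders):

    [bannedsites]
    url=http://example.com/blacklist.tar.gz
    ; comma separated terms, only archive members whose path contains one are kept
    filter=adv,porn
    enabled=1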