Commit a03cc14f authored by Ad Schellevis

(proxy) improve remote acl handling, sort and structure output for squid.

parent 78c3d912
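The central addition is a DomainSorter helper that spools downloaded blacklist entries into temporary buckets keyed on the reversed domain name, so hosts sharing a suffix sort next to each other and duplicates can be dropped when the list is flushed for squid. A minimal standalone sketch of that reverse-string sort key (example domains are hypothetical; the commit additionally buckets the data and sorts each bucket in reverse order):

domains = ['ads.example.com', 'example.com', 'cdn.example.org', 'www.example.com']
# sorting on the reversed string groups entries by common suffix,
# mirroring the key DomainSorter derives in write() / add()
for name in sorted(domains, key=lambda d: d[::-1]):
    print(name)
# -> cdn.example.org, example.com, ads.example.com, www.example.com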
#!/usr/local/bin/python2.7
"""
Copyright (c) 2016 Ad Schellevis - Deciso B.V.
Copyright (c) 2015 Jos Schellevis - Deciso B.V.
All rights reserved.
@@ -26,6 +27,7 @@
POSSIBILITY OF SUCH DAMAGE.
"""
import tempfile
import urllib2
import os
import json
@@ -42,54 +44,65 @@ acl_config_fn = ('/usr/local/etc/squid/externalACLs.conf')
acl_target_dir = ('/usr/local/etc/squid/acl')
acl_max_timeout = 30
class ACLDownload(object):
class Downloader(object):
""" Download helper
"""
def __init__(self, url, timeout):
""" init new
:param url: source url
:param timeout: timeout in seconds
"""
self._url = url
self._timeout = timeout
self._source_data = None
self._source_handle = None
self._target_data = None
def fetch(self):
""" fetch (raw) source data into self._source_data
""" fetch (raw) source data into tempfile using self._source_handle
"""
try:
f = urllib2.urlopen(self._url,timeout = self._timeout)
self._source_data = f.read()
f = urllib2.urlopen(self._url, timeout = self._timeout)
# flush to temp file
self._source_handle = tempfile.NamedTemporaryFile()
while True:
data = f.read(1024)
if not data:
break
else:
self._source_handle.write(data)
self._source_handle.seek(0)
f.close()
except (urllib2.URLError, urllib2.HTTPError, IOError) as e:
syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s'%self._url)
self._source_data = None
self._source_handle = None
def get_files(self):
""" process downloaded data, handle compression
:return: iterator filename, content
:return: iterator filename, file handle
"""
if self._source_data is not None:
if self._source_handle is not None:
# handle compressed data
if (len(self._url) > 8 and self._url[-7:] == '.tar.gz') \
or (len(self._url) > 4 and self._url[-4:] == '.tgz'):
# source is in tar.gz format, extract all into a single string
try:
tf = tarfile.open(fileobj=StringIO.StringIO(self._source_data))
tf = tarfile.open(fileobj=self._source_handle)
for tf_file in tf.getmembers():
if tf_file.isfile():
yield tf_file.name, tf.extractfile(tf_file).read()
yield tf_file.name, tf.extractfile(tf_file)
except IOError as e:
syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)'%(self._url, e))
elif len(self._url) > 4 and self._url[-3:] == '.gz':
# source is in .gz format unpack
try:
gf = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(self._source_data))
yield os.path.basename(self._url), gf.read()
gf = gzip.GzipFile(mode='r', fileobj=self._source_handle)
yield os.path.basename(self._url), gf
except IOError as e:
syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)'%(self._url, e))
elif len(self._url) > 5 and self._url[-4:] == '.zip':
# source is in .zip format, extract all into a single string
target_data = dict()
with zipfile.ZipFile(StringIO.StringIO(self._source_data),
with zipfile.ZipFile(self._source_handle,
mode='r',
compression=zipfile.ZIP_DEFLATED) as zf:
for item in zf.infolist():
@@ -97,20 +110,121 @@ class ACLDownload(object):
yield item.filename, zf.read(item)
self._target_data = target_data
else:
yield os.path.basename(self._url), self._source_data
yield os.path.basename(self._url), self._source_handle
def download(self):
""" download / unpack ACL
:return: iterator filename, type, content
"""
self.fetch()
for filename, filedata in self.get_files():
for line in filedata.split('\n'):
if line.find('/') > -1:
file_type = 'url'
for filename, filehandle in self.get_files():
while True:
line = filehandle.readline()
if not line:
break
yield filename, line
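# Illustrative usage sketch (URL is hypothetical): Downloader streams the remote list into a
# temporary file and yields it back line by line, unpacking .tar.gz/.tgz/.gz/.zip sources first.
#   acl = Downloader('https://example.com/blacklist.tar.gz', 30)
#   for filename, line in acl.download():
#       pass  # one entry of every file contained in the download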
class DomainSorter(object):
""" Helper class for building sorted squid domain acl list.
Use as file type object, close flushes the actual (sorted) data to disc
"""
def __init__(self, filename=None, mode=None):
""" new sorted output file, uses an acl record in reverse order as sort key
:param filename: target filename
:param mode: file open mode
"""
self._num_targets = 20
self._seperator = '|'
self._buckets = dict()
self._sort_map = dict()
# setup target
self._target_filename = filename
self._target_mode = mode
# setup temp files
self.generate_targets()
def generate_targets(self):
""" generate ordered targets
"""
sets = 255
for i in range(sets):
target = chr(i+1)
setid = int(i / (sets / self._num_targets))
if setid not in self._buckets:
self._buckets[setid] = tempfile.NamedTemporaryFile()
self._sort_map[target] = self._buckets[setid]
def write(self, data):
""" save content, send reverse sorted to buffers
:param data: line to write
"""
line = data.strip()
if len(line) > 0:
self.add(line[::-1], line)
def add(self, key, value):
""" spool data to temp
:param key: key to use
:param value: value to store
"""
target = key[0]
if target in self._sort_map:
self._sort_map[target].write('%s%s%s\n'%(key, self._seperator, value))
else:
# not supposed to happen, every key should have a calculated target pool
pass
def reader(self):
""" read reverse
"""
for target in sorted(self._buckets):
self._buckets[target].seek(0)
set_content = dict()
while True:
line = self._buckets[target].readline()
if not line:
break
else:
file_type = 'domain'
yield filename, file_type, line
set_content[line.split('|')[0]] = '|'.join(line.split('|')[1:])
for itemkey in sorted(set_content, reverse=True):
yield set_content[itemkey]
@staticmethod
def is_domain(tag):
""" check if tag is probably a domain name
:param tag: tag to inspect
:return: boolean
"""
has_chars = False
for tag_item in tag:
if not tag_item.isdigit() and tag_item not in ('.', ',', '|', '/', '\n'):
has_chars = True
elif tag_item in (':', '|', '/'):
return False
if has_chars:
return True
else:
return False
def close(self):
""" close and dump content
"""
if self._target_filename is not None and self._target_mode is not None:
# flush to file on close
with open(self._target_filename, self._target_mode) as f_out:
prev_line = None
for line in self.reader():
line = line.lstrip('.')
if prev_line == line:
# duplicate, skip
continue
if self.is_domain(line):
# prefix domain, but only if the chances are very small it will overlap
if prev_line is None or line not in prev_line:
f_out.write('.')
f_out.write(line)
prev_line = line
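# Illustrative usage sketch (paths and domains are hypothetical): DomainSorter acts as a
# writable file object; write() spools each line keyed on the reversed domain, and close()
# flushes the suffix-sorted, de-duplicated result to the target file.
#   sorter = DomainSorter('/tmp/example.acl', 'wb')
#   sorter.write('ads.example.com\n')
#   sorter.write('www.example.com\n')
#   sorter.close()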
def filename_in_ignorelist(filename):
@@ -119,82 +233,90 @@ def filename_in_ignorelist(filename):
"""
if (filename.lower().split('.')[-1] in ['pdf', 'txt', 'doc']):
return True
elif (filename.lower() in ('readme', 'license')):
elif (filename.lower() in ('readme', 'license', 'usage', 'categories')):
return True
return False
# parse OPNsense external ACLs config
if os.path.exists(acl_config_fn):
# create acl directory (if new)
if not os.path.exists(acl_target_dir):
os.mkdir(acl_target_dir)
else:
# remove index files
for filename in glob.glob('%s/*.index'%acl_target_dir):
os.remove(filename)
# read config and download per section
cnf = ConfigParser()
cnf.read(acl_config_fn)
for section in cnf.sections():
target_filename = acl_target_dir+'/'+section
if cnf.has_option(section,'url'):
# collect filters to apply
acl_filters = list()
if cnf.has_option(section,'filter'):
for acl_filter in cnf.get(section,'filter').strip().split(','):
if len(acl_filter.strip()) > 0:
acl_filters.append(acl_filter)
# define targets
targets = {'domain': {'filename': target_filename, 'handle' : None},
'url': {'filename': '%s.url'%target_filename, 'handle': None}}
# only generate files if enabled, otherwise dump empty files
if cnf.has_option(section,'enabled') and cnf.get(section,'enabled') == '1':
download_url = cnf.get(section,'url')
acl = ACLDownload(download_url, acl_max_timeout)
all_filenames = list()
for filename, filetype, line in acl.download():
if filename_in_ignorelist(os.path.basename(filename)):
# ignore documents, licenses and readme's
continue
def main():
# parse OPNsense external ACLs config
if os.path.exists(acl_config_fn):
# create acl directory (if new)
if not os.path.exists(acl_target_dir):
os.mkdir(acl_target_dir)
else:
# remove index files
for filename in glob.glob('%s/*.index'%acl_target_dir):
os.remove(filename)
# read config and download per section
cnf = ConfigParser()
cnf.read(acl_config_fn)
for section in cnf.sections():
target_filename = acl_target_dir+'/'+section
if cnf.has_option(section,'url'):
# collect filters to apply
acl_filters = list()
if cnf.has_option(section,'filter'):
for acl_filter in cnf.get(section,'filter').strip().split(','):
if len(acl_filter.strip()) > 0:
acl_filters.append(acl_filter)
# define target(s)
targets = {'domain': {'filename': target_filename, 'handle' : None, 'class': DomainSorter}}
if filename not in all_filenames:
all_filenames.append(filename)
if len(acl_filters) > 0:
acl_found = False
for acl_filter in acl_filters:
if filename.find(acl_filter) > -1:
acl_found = True
break
if not acl_found:
# skip this acl entry
# only generate files if enabled, otherwise dump empty files
if cnf.has_option(section,'enabled') and cnf.get(section,'enabled') == '1':
download_url = cnf.get(section,'url')
acl = Downloader(download_url, acl_max_timeout)
all_filenames = list()
for filename, line in acl.download():
if filename_in_ignorelist(os.path.basename(filename)):
# ignore documents, licenses and readme's
continue
if filetype in targets and targets[filetype]['handle'] is None:
targets[filetype]['handle'] = open(targets[filetype]['filename'], 'wb')
if filetype in targets:
targets[filetype]['handle'].write('%s\n'%line)
# save index to disc
with open('%s.index'%target_filename,'wb') as idx_out:
index_data = dict()
for filename in all_filenames:
if len(filename.split('/')) > 2:
index_key = '/'.join(filename.split('/')[1:-1])
if index_key not in index_data:
index_data[index_key] = index_key
idx_out.write(json.dumps(index_data))
# cleanup
for filetype in targets:
if targets[filetype]['handle'] is not None:
targets[filetype]['handle'].close()
elif cnf.has_option(section,'enabled') and cnf.get(section,'enabled') != '1':
if os.path.isfile(targets[filetype]['filename']):
# disabled, remove previous data
os.remove(targets[filetype]['filename'])
elif not os.path.isfile(targets[filetype]['filename']):
# no data fetched and no file available, create new empty file
with open(targets[filetype]['filename'], 'wb') as target_out:
target_out.write("")
# detect output type
if '/' in line or '|' in line:
file_type = 'url'
else:
file_type = 'domain'
if filename not in all_filenames:
all_filenames.append(filename)
if len(acl_filters) > 0:
acl_found = False
for acl_filter in acl_filters:
if acl_filter in filename:
acl_found = True
break
if not acl_found:
# skip this acl entry
continue
if filetype in targets and targets[filetype]['handle'] is None:
targets[filetype]['handle'] = targets[filetype]['class'](targets[filetype]['filename'], 'wb')
if filetype in targets:
targets[filetype]['handle'].write('%s\n'%line)
# save index to disc
with open('%s.index'%target_filename,'wb') as idx_out:
index_data = dict()
for filename in all_filenames:
if len(filename.split('/')) > 2:
index_key = '/'.join(filename.split('/')[1:-1])
if index_key not in index_data:
index_data[index_key] = index_key
idx_out.write(json.dumps(index_data))
# cleanup
for filetype in targets:
if targets[filetype]['handle'] is not None:
targets[filetype]['handle'].close()
elif cnf.has_option(section,'enabled') and cnf.get(section,'enabled') != '1':
if os.path.isfile(targets[filetype]['filename']):
# disabled, remove previous data
os.remove(targets[filetype]['filename'])
elif not os.path.isfile(targets[filetype]['filename']):
# no data fetched and no file available, create new empty file
with open(targets[filetype]['filename'], 'wb') as target_out:
target_out.write("")
# execute downloader
main()
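The per-section files written to /usr/local/etc/squid/acl are plain domain lists; entries recognised as domains get a leading dot so squid's dstdomain matching also covers subdomains. A hypothetical excerpt of such an output file, and a squid.conf rule that could consume it (section and acl names are illustrative):

# hypothetical contents of /usr/local/etc/squid/acl/<section>
.example.com
.tracker.example.net

# illustrative squid.conf usage
acl remote_blacklist dstdomain "/usr/local/etc/squid/acl/<section>"
http_access deny remote_blacklist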