(proxy) optimize acl download, split domains/urls and add index file

932e2d69 · Ad Schellevis · 2a1afe9e · 932e2d69
Commit 932e2d69 authored Feb 29, 2016 by Ad Schellevis
Hide whitespace changes
Inline Side-by-side

Showing with 63 additions and 42 deletions

fetchACLs.py src/opnsense/scripts/proxy/fetchACLs.py +63 -42

No files found.
--- a/src/opnsense/scripts/proxy/fetchACLs.py
+++ b/src/opnsense/scripts/proxy/fetchACLs.py
@@ -28,6 +28,8 @@

 import urllib2
 import os
+import json
+import glob
 import os.path
 import tarfile
 import gzip
@@ -61,8 +63,9 @@ class ACLDownload(object):
            syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s'%self._url)
            self._source_data = None

-    def pre_process(self):
-        """ pre process downloaded data, handle compression
+    def get_files(self):
+        """ process downloaded data, handle compression
+            :return: iterator filename, content
        """
        if self._source_data is not None:
            # handle compressed data
@@ -71,49 +74,43 @@ class ACLDownload(object):
                # source is in tar.gz format, extract all into a single string
                try:
                    tf = tarfile.open(fileobj=StringIO.StringIO(self._source_data))
-                    target_data = []
                    for tf_file in tf.getmembers():
                        if tf_file.isfile():
-                            target_data.append(tf.extractfile(tf_file).read())
-                    self._target_data = ''.join(target_data)
+                            yield tf_file.name, tf.extractfile(tf_file).read()
                except IOError as e:
                    syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)'%(self._url, e))
            elif len(self._url) > 4 and self._url[-3:] == '.gz':
                # source is in .gz format unpack
                try:
                    gf = gzip.GzipFile(mode='r', fileobj=StringIO.StringIO(self._source_data))
-                    self._target_data = gf.read()
+                    yield os.path.basename(self._url), gf.read()
                except IOError as e:
                    syslog.syslog(syslog.LOG_ERR, 'proxy acl: error downloading %s (%s)'%(self._url, e))
            elif len(self._url) > 5 and self._url[-4:] == '.zip':
                # source is in .zip format, extract all into a single string
-                target_data = []
+                target_data = dict()
                with zipfile.ZipFile(StringIO.StringIO(self._source_data),
                                     mode='r',
                                     compression=zipfile.ZIP_DEFLATED) as zf:
                    for item in zf.infolist():
-                        target_data.append(zf.read(item))
-                    self._target_data = ''.join(target_data)
+                        if item.file_size > 0:
+                            yield item.filename, zf.read(item)
+                    self._target_data = target_data
            else:
-                self._target_data = self._source_data
+                yield os.path.basename(self._url), self._source_data

    def download(self):
-        self.fetch()
-        self.pre_process()
-
-    def is_valid(self):
-        """ did this ACL download successful
-        """
-        if self._target_data is not None:
-            return True
-        else:
-            return False
-
-    def get_data(self):
-        """ retrieve data
+        """ download / unpack ACL
+            :return: iterator filename, type, content
        """
-        # XXX: maybe some postprocessing is needed here, all will be used with a squid dstdom_regex tag
-        return self._target_data
+        self.fetch()
+        for filename, filedata in self.get_files():
+            for line in filedata.split('\n'):
+                if line.find('/') > -1:
+                    file_type = 'url'
+                else:
+                    file_type = 'domain'
+                yield filename, file_type, line


 # parse OPNsense external ACLs config
@@ -121,6 +118,10 @@ if os.path.exists(acl_config_fn):
    # create acl directory (if new)
    if not os.path.exists(acl_target_dir):
        os.mkdir(acl_target_dir)
+    else:
+        # remove index files
+        for filename in glob.glob('%s/*.index'%acl_target_dir):
+            os.remove(filename)
    # read config and download per section
    cnf = ConfigParser()
    cnf.read(acl_config_fn)
@@ -129,22 +130,42 @@ if os.path.exists(acl_config_fn):
        if cnf.has_option(section,'enabled'):
            # if enabled fetch file
            target_filename = acl_target_dir+'/'+section
-            if cnf.get(section,'enabled')=='1':
-                if cnf.has_option(section,'url'):
+            if cnf.has_option(section,'url'):
+                # define targets
+                targets = {'domain': {'filename': target_filename, 'handle' : None},
+                           'url': {'filename': '%s.url'%target_filename, 'handle': None}}
+
+                # download file
+                if cnf.get(section,'enabled') == '1':
+                    # only download when enabled; the cleanup below removes old files when disabled, or creates empty ones when nothing was fetched and none exist
                    download_url = cnf.get(section,'url')
                    acl = ACLDownload(download_url, acl_max_timeout)
-                    acl.download()
-                    if acl.is_valid():
-                        output_data = acl.get_data()
-                        with open(target_filename, "wb") as code:
-                            code.write(output_data)
-                    elif not os.path.isfile(target_filename):
-                        # if there's no file available, create an empty one (otherwise leave the last download).
-                        with open(target_filename, "wb") as code:
-                            code.write("")
-            # if disabled or not 1 try to remove old file
-            elif cnf.get(section,'enabled')!='1':
-                try:
-                    os.remove(acl_target_dir+'/'+section)
-                except OSError:
-                    pass
+                    all_filenames = list()
+                    for filename, filetype, line in acl.download():
+                        if filename not in all_filenames:
+                            all_filenames.append(filename)
+                        if filetype in targets and targets[filetype]['handle'] is None:
+                            targets[filetype]['handle'] = open(targets[filetype]['filename'], 'wb')
+                        if filetype in targets:
+                            targets[filetype]['handle'].write('%s\n'%line)
+                    # save index to disc
+                    with open('%s.index'%target_filename,'wb') as idx_out:
+                        index_data = dict()
+                        for filename in all_filenames:
+                            if len(filename.split('/')) > 3:
+                                index_key = '/'.join(filename.split('/')[1:-1])
+                                if index_key not in index_data:
+                                    index_data[index_key] = index_key
+                        idx_out.write(json.dumps(index_data))
+                # cleanup
+                for filetype in targets:
+                    if targets[filetype]['handle'] is not None:
+                        targets[filetype]['handle'].close()
+                    elif cnf.get(section,'enabled') != '1':
+                        if os.path.isfile(targets[filetype]['filename']):
+                            # disabled, remove previous data
+                            os.remove(targets[filetype]['filename'])
+                    elif not os.path.isfile(targets[filetype]['filename']):
+                        # no data fetched and no file available, create new empty file
+                        with open(targets[filetype]['filename'], 'wb') as target_out:
+                            target_out.write("")