# Source code for bspump.elasticsearch.source

import logging

from ..abc.source import TriggerSource

# Module-level logger; both sources in this file report failed ElasticSearch requests through it.
L = logging.getLogger(__name__)


class ElasticSearchSource(TriggerSource):
    """
    Source that reads documents from ElasticSearch using the scroll API and
    feeds the ``_source`` of every hit into the pipeline.

    Configuration options:

    index : index name or pattern that is searched (default ``index-*``).

    scroll_timeout : value of the ``scroll`` parameter sent with every request
    (default ``1m``).

    """
    ConfigDefaults = {
        # Index name or pattern placed into the search URL.
        'index': 'index-*',
        # Sent as the `scroll` value of the initial and of every follow-up request.
        'scroll_timeout': '1m',
    }

    def __init__(self, app, pipeline, connection, request_body=None, paging=True, id=None, config=None):
        """
        Resolve the connection and remember the query that `cycle()` will send.

        **Parameters**

        app : Application
                `Application <https://asab.readthedocs.io/en/latest/asab/application.html>`_ object,
                passed to the parent constructor and to `pipeline.locate_connection()`.

        pipeline : Pipeline
                Pipeline this source belongs to; it is also used to locate the connection.

        connection : Connection
                Connection (or its identifier) handed to `pipeline.locate_connection()`.

        request_body : JSON/dict, default = None
                Body of the initial search request. When None, a `match_all` query is used.

        paging : bool, default = True
                When True, `cycle()` keeps following the scroll until it is exhausted.
                When False, only the first page of hits is processed.

        id : ID, default = None
                Passed to the parent constructor.

        config : JSON/dict, default = None
                Passed to the parent constructor; overrides `ConfigDefaults`.

        """
        super().__init__(app, pipeline, id=id, config=config)
        self.Connection = pipeline.locate_connection(app, connection)

        self.Index = self.Config['index']
        self.ScrollTimeout = self.Config['scroll_timeout']
        self.Paging = paging

        if request_body is None:
            # No query supplied by the caller: select every document.
            request_body = {
                'query': {
                    'bool': {
                        'must': {
                            'match_all': {}
                        }
                    }
                }
            }
        self.RequestBody = request_body

    async def cycle(self):
        """
        Gets data from Elastic and injects them into the pipeline.

        The first request opens a scroll on the configured index; every following
        request continues that scroll using the `_scroll_id` from the previous response.
        The loop ends when:

        * the server answers with a status other than 200 (the error is logged),
        * the response carries no `_scroll_id`,
        * a page contains no hits,
        * or `paging` is disabled and the first page has been processed.

        """
        scroll_id = None

        while True:
            if scroll_id is None:
                # Initial request: run the user query and open a scroll context.
                path = '{}/_search?scroll={}'.format(self.Index, self.ScrollTimeout)
                request_body = self.RequestBody
            else:
                # Follow-up request: fetch the next page of the open scroll.
                path = "_search/scroll"
                request_body = {"scroll": self.ScrollTimeout, "scroll_id": scroll_id}

            # NOTE(review): plain concatenation - presumably get_url() ends with '/'; confirm.
            url = self.Connection.get_url() + path
            async with self.Connection.get_session() as session:
                async with session.post(
                    url,
                    json=request_body,
                    headers={'Content-Type': 'application/json'}
                ) as response:

                    if response.status != 200:
                        data = await response.text()
                        L.error("Failed to fetch data from ElasticSearch: {} from {}\n{}".format(response.status, url, data))
                        break

                    msg = await response.json()

            scroll_id = msg.get('_scroll_id')
            if scroll_id is None:
                break

            hits = msg['hits']['hits']
            if not hits:
                # An empty page means the scroll is exhausted.
                break

            # Feed messages into a pipeline
            for hit in hits:
                await self.process(hit['_source'])

            if not self.Paging:
                break


class ElasticSearchAggsSource(TriggerSource):
    """
    Source that runs one search request against ElasticSearch and turns the
    first aggregation of the response into pipeline events.

    Every `value` found in the aggregation tree produces one event. The event is
    a dict holding the bucket keys met on the way down (stored under the name of
    the aggregation that owns the buckets) plus the value itself (stored under
    the name of the aggregation that reported it).

    Configuration options:

    index : index name or pattern that is searched (default ``index-*``).

    """
    ConfigDefaults = {
        # Index name or pattern placed into the search URL.
        'index': 'index-*',
    }

    def __init__(self, app, pipeline, connection, request_body=None, id=None, config=None):
        """
        Resolve the connection and remember the query that `cycle()` will send.

        **Parameters**

        app : Application
                `Application <https://asab.readthedocs.io/en/latest/asab/application.html>`_ object,
                passed to the parent constructor and to `pipeline.locate_connection()`.

        pipeline : Pipeline
                Pipeline this source belongs to; it is also used to locate the connection.

        connection : Connection
                Connection (or its identifier) handed to `pipeline.locate_connection()`.

        request_body : JSON/dict, default = None
                Body of the search request. When None, a `match_all` query without any
                aggregation is used.

        id : ID, default = None
                Passed to the parent constructor.

        config : JSON/dict, default = None
                Passed to the parent constructor; overrides `ConfigDefaults`.

        """
        super().__init__(app, pipeline, id=id, config=config)
        self.Connection = pipeline.locate_connection(app, connection)

        self.Index = self.Config['index']

        if request_body is None:
            # No query supplied by the caller: select every document.
            request_body = {
                'query': {
                    'bool': {
                        'must': {
                            'match_all': {}
                        }
                    }
                }
            }
        self.RequestBody = request_body

    async def cycle(self):
        """
        Sends the search request and processes the first aggregation of the response.

        A status other than 200 is logged and ends the cycle. A response with no
        (or empty) `aggregations` ends the cycle silently. Only the first
        top-level aggregation is processed.

        """
        url = self.Connection.get_url() + '{}/_search?'.format(self.Index)
        async with self.Connection.get_session() as session:
            async with session.post(
                url,
                json=self.RequestBody,
                headers={'Content-Type': 'application/json'}
            ) as response:

                if response.status != 200:
                    data = await response.text()
                    L.error("Failed to fetch data from ElasticSearch: {} from {}\n{}".format(response.status, url, data))
                    return

                msg = await response.json()

        # The key is absent when the request body defines no aggregation
        # (as the default request body does), so it must not be indexed directly.
        aggs = msg.get('aggregations')
        if not aggs:
            return

        start_name = next(iter(aggs))
        await self.process_aggs({}, start_name, aggs[start_name])

    async def process_aggs(self, path, aggs_name, aggs):
        """
        Processes one aggregation result: descends into its buckets and/or emits its value.

        **Parameters**

        path : dict
                Bucket keys collected so far. It is updated in place while the event
                is built and restored before returning.

        aggs_name : str
                Name of the aggregation; used as the event key for its value.

        aggs : dict
                Aggregation result, inspected for `buckets` and `value`.

        """
        if 'buckets' in aggs:
            await self.process_buckets(path, aggs_name, aggs["buckets"])

        if 'value' in aggs:
            # The event is a snapshot of the collected bucket keys plus this value.
            event = dict(path)
            event[aggs_name] = aggs['value']
            await self.process(event)

    async def process_buckets(self, path, parent, buckets):
        """
        Recursive function for buckets processing.
        For every bucket it stores the bucket key in `path` under `parent` and then
        passes each dict-valued entry of the bucket to `process_aggs`, which either
        descends into further buckets or sends an event to process.

        **Parameters**

        path : dict
                Bucket keys collected so far; `path[parent]` is set for the duration
                of each bucket and removed before returning.

        parent : str
                Name of the aggregation that owns `buckets`.

        buckets : list or dict
                List of bucket dicts, or a dict of bucket dicts indexed by bucket name
                (keyed buckets); in the latter case the name is used as the bucket key
                unless the bucket has a `key` of its own.

        """
        if isinstance(buckets, dict):
            # Keyed buckets: turn {name: bucket} into the list form.
            buckets = [{'key': name, **bucket} for name, bucket in buckets.items()]

        for bucket in buckets:
            # Store the key before any nested aggregation is processed, so that
            # events do not depend on the order of entries inside the bucket.
            if 'key' in bucket:
                path[parent] = bucket['key']
            else:
                path.pop(parent, None)

            for name, value in bucket.items():
                # 'key' itself may be a dict; it is a bucket key, not a nested aggregation.
                if name != 'key' and isinstance(value, dict):
                    await self.process_aggs(path, name, value)

        # Do not leak the last bucket key into aggregations processed by the caller afterwards.
        path.pop(parent, None)