import asyncio
import logging
from collections import Counter
from dataclasses import dataclass
import elasticsearch
from biothings.web.query.builder import RawQueryInterrupt
from biothings.web.query.engine import EndScrollInterrupt, RawResultInterrupt
# This module defines the two types of operations supported by each
# query pipeline class. The first, "search", performs a key-value
# pair search: the value is passed as parameter "q", and the
# (sometimes optional) parameter "scopes" lists the keys under which
# to look for that value. The second, "fetch", looks up documents by
# pre-defined fields that serve as their mostly unique identifiers;
# it is a special case of "search" that usually returns only one
# document. For this type of query, "scopes" must NOT be provided,
# and the query term is named "id" in the parameter list.
# This module is called "pipeline" because
# a) it is a combination of individual query processing stages, and
# b) during async operations, it works like a CPU pipeline: more
#    than one stage can be busy at a single point in time.
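#
# A minimal sketch of the two call shapes (field names and values
# below are illustrative, not part of this module):
#
# >>> pipeline.search("cdk2", scopes=["symbol"])  # key-value search
# >>> pipeline.fetch("1017")  # identifier lookup; "scopes" not allowed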
logger = logging.getLogger(__name__)
@dataclass(frozen=True)
class QueryPipelineException(Exception):
    code: int = 500
    summary: str = ""
    details: object = None
    # Use "code" to indicate the error type instead of defining one
    # exception class per error; this reduces the number of exception
    # types this module needs to create. Furthermore, the codes form
    # a superset of HTTP status codes, so error translation in the
    # upper layers is convenient and straightforward.
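    #
    # A sketch of how an upper layer might translate the code into an
    # HTTP response ("handler" and its send_error method stand in for
    # whatever web framework hosts the pipeline):
    #
    # >>> try:
    # ...     result = await pipeline.fetch(bid)
    # ... except QueryPipelineException as exc:
    # ...     handler.send_error(exc.code, reason=exc.summary)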
class QueryPipelineInterrupt(QueryPipelineException):
    def __init__(self, data):
        super().__init__(200, "", data)
class QueryPipeline:
    def __init__(self, builder, backend, formatter, **settings):
        self.builder = builder
        self.backend = backend
        self.formatter = formatter
        self.settings = settings
    def search(self, q, **options):
        query = self.builder.build(q, **options)
        result = self.backend.execute(query, **options)
        return self.formatter.transform(result, **options) 
    def fetch(self, id, **options):
        assert options.get("scopes") is None
        result = self.search(id, **options)
        return result 
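
# A minimal wiring sketch for the base class, using the ES components
# imported later in this module ("client" is assumed to be an
# Elasticsearch client instance):
#
# >>> from biothings.web.query.builder import ESQueryBuilder
# >>> from biothings.web.query.engine import ESQueryBackend
# >>> from biothings.web.query.formatter import ESResultFormatter
# >>> pipeline = QueryPipeline(
# ...     ESQueryBuilder(), ESQueryBackend(client), ESResultFormatter())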
 
def _simplify_ES_exception(exc, debug=False):
    result = {}
    try:
        root_cause = exc.info.get("error", exc.info)
        root_cause = root_cause["root_cause"][0]["reason"]
        root_cause = root_cause.replace('"', "'").split("\n")
        for index, cause in enumerate(root_cause):
            result["root_cuase_line_" + f"{index:02}"] = cause
    except IndexError:
        pass  # no root cause
    except Exception:
        logger.exception(
            " ".join(
                (
                    "Unexpected error in _simplify_ES_exception.",
                    "Caused by incompatible version, build, etc.",
                    "Update ES exception parsing logic here.",
                )
            )
        )
    if debug:  # raw ES error response
        result["debug"] = exc.info
    return exc.error, result
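
# For example, for a typical ES query-parsing error the return value
# might look like the following (the payload is illustrative, not an
# exact ES response):
#
# >>> _simplify_ES_exception(exc)
# ('search_phase_execution_exception',
#  {'root_cause_line_00': "Failed to parse query [cdk2 AND]"})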
def capturesESExceptions(func):
    async def _(*args, **kwargs):
        try:
            return await func(*args, **kwargs)
        except (
            RawQueryInterrupt,  # corresponds to the 'rawquery' option
            RawResultInterrupt,  # corresponds to the 'raw' option
            EndScrollInterrupt,
        ) as exc:
            raise QueryPipelineInterrupt(exc.data)
        except AssertionError as exc:
            # In our application, AssertionError indicates an internal
            # error. Components validating user input should instead
            # raise exceptions like ValueError and TypeError.
            logger.exception("FIXME: Unexpected Assertion Error.", exc_info=exc)
            raise QueryPipelineException(500, str(exc) or "N/A")
        except (ValueError, TypeError) as exc:
            raise QueryPipelineException(400, type(exc).__name__, str(exc))
        except elasticsearch.ConnectionError:  # like timeouts..
            raise QueryPipelineException(503)
        except elasticsearch.RequestError as exc:  # 400s
            raise QueryPipelineException(400, *_simplify_ES_exception(exc))
        # The managed Elasticsearch service on AWS may raise slightly
        # different exceptions when the server is overloaded compared
        # to a self-managed ES deployment. This case, and most of the
        # handling below, could be studied further: the exception
        # handling from this point on is based on experience, and more
        # detailed documentation would be helpful.
        except elasticsearch.TransportError as exc:  # >400
            if exc.error == "search_phase_execution_exception":
                reason = exc.info.get("caused_by", {}).get("reason", "")
                if "rejected execution" in reason:
                    raise QueryPipelineException(503)
                else:  # unexpected, provide additional information for debug
                    raise QueryPipelineException(500, *_simplify_ES_exception(exc, True))
            elif exc.error == "index_not_found_exception":
                raise QueryPipelineException(500, exc.error)
            elif exc.status_code in (429, "N/A"):
                raise QueryPipelineException(503)
            else:  # unexpected
                raise
    return _ 
class AsyncESQueryPipeline(QueryPipeline):
    @capturesESExceptions
    async def search(self, q, **options):
        if isinstance(q, list):  # multisearch
            options["templates"] = (dict(query=_q) for _q in q)
            options["template_miss"] = dict(notfound=True)
            options["template_hit"] = dict()
        query = self.builder.build(q, **options)
        response = await self.backend.execute(query, **options)
        result = self.formatter.transform(response, **options)
        return result 
    @capturesESExceptions
    async def fetch(self, id, **options):
        if options.get("scopes"):
            raise ValueError("Scopes Not Allowed.")
        # for builder
        options["autoscope"] = True
        # for formatter
        options["version"] = True
        options["score"] = False
        options["one"] = True
        # The annotation endpoint should work on fields whose values
        # are reasonably unique, such as id and symbol fields. Because
        # we do not provide pagination for this endpoint (it is largely
        # unnecessary), when a request matches too many documents we
        # raise an exception instead of giving the user incomplete
        # matches. When this happens, it usually indicates a bad choice
        # of default fields.
        MAX_MATCH = self.settings.get("fetch_max_match", 1000)
        options["size"] = MAX_MATCH + 1  # err when len(res) > MAX
        # "fetch" is a wrapper over "search".
        # ----------------------------------------
        result = await self.search(id, **options)
        # ----------------------------------------
        if isinstance(id, list):  # batch
            counter = Counter(x["query"] for x in result)
            if counter.most_common(1)[0][1] > MAX_MATCH:
                raise QueryPipelineException(500, "Too Many Matches.")
        else:  # single
            if result is None:
                raise QueryPipelineException(404, "Not Found.")
            if isinstance(result, list) and len(result) > MAX_MATCH:
                raise QueryPipelineException(500, "Too Many Matches.")
        return result 
 
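# A minimal usage sketch in an async context, mirroring the sync
# examples below ("1017", "symbol" and "cdk2" are illustrative):
#
# >>> pipeline = AsyncESQueryPipeline(builder, backend, formatter)
# >>> doc = await pipeline.fetch("1017", _source=["symbol"])
# >>> docs = await pipeline.fetch(["1017", "1018"])  # batch lookup
# >>> res = await pipeline.search("cdk2", scopes=["symbol"])
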
class ESQueryPipeline(QueryPipeline):  # sync wrapper over the async client
    # This implementation may not be performance-optimized. It serves
    # as a proof of concept that helps the design of upper-layer
    # constructs, and it also simplifies testing by providing ioloop
    # management, enabling synchronous access.
    # >>> from biothings.web.query.pipeline import ESQueryPipeline
    # >>> pipeline = ESQueryPipeline()
    #
    # >>> pipeline.fetch("1017", _source=["symbol"])
    # {'_id': '1017', '_version': 1, 'symbol': 'CDK2'}
    #
    # >>> pipeline.search("1017", _source=["symbol"])
    # {
    #   'took': 11,
    #   'total': 1,
    #   'max_score': 4.0133753,
    #   'hits': [
    #       {
    #           '_id': '1017',
    #           '_score': 4.0133753,
    #           'symbol': 'CDK2'
    #       }
    #   ]
    # }
    def __init__(self, builder=None, backend=None, formatter=None, *args, **kwargs):
        if not builder:
            from biothings.web.query.builder import ESQueryBuilder
            builder = ESQueryBuilder()
        if not backend:
            from biothings.web.connections import get_es_client
            from biothings.web.query.engine import ESQueryBackend
            client = get_es_client(async_=True)  # async client; used via _run_coroutine
            backend = ESQueryBackend(client)
        if not formatter:
            from biothings.web.query.formatter import ESResultFormatter
            formatter = ESResultFormatter()
        super().__init__(builder, backend, formatter, *args, **kwargs)
    def _run_coroutine(self, coro, *args, **kwargs):
        # Delegate to the async pipeline and block until it completes,
        # providing the synchronous access described above.
        loop = asyncio.get_event_loop()
        pipeline = AsyncESQueryPipeline(self.builder, self.backend, self.formatter)
        return loop.run_until_complete(coro(pipeline, *args, **kwargs))
    def search(self, q, **options):
        return self._run_coroutine(AsyncESQueryPipeline.search, q, **options) 
    def fetch(self, id, **options):
        return self._run_coroutine(AsyncESQueryPipeline.fetch, id, **options) 
 
class MongoQueryPipeline(QueryPipeline):
    pass 
class SQLQueryPipeline(QueryPipeline):
    pass