# Source code for pallas.client

# Copyright 2020 Akamai Technologies, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

import copy
import logging
import time
from typing import Iterable

from pallas.caching import AthenaCache
from pallas.info import QueryInfo
from pallas.proxies import AthenaProxy
from pallas.results import QueryResults
from pallas.sql import (
    PARAMETERS,
    is_select,
    normalize_sql,
    quote,
    substitute_parameters,
)
from pallas.utils import Fibonacci

logger = logging.getLogger(__name__)


class Query:
    """
    Athena query

    Provides access to one query execution.
    It can be used to monitor the status of the query
    or to retrieve results when the execution finishes.

    Instances of this class are returned by :meth:`Athena.submit`
    and :meth:`Athena.get_query` methods.
    You should not need to create this class directly.

    :param execution_id: Athena query execution ID.
    :param proxy: an internal proxy to execute queries
    :param cache: a cache instance
    """

    #: Delays in seconds between checks of the query status.
    backoff: Iterable[int] = Fibonacci(max_value=60)

    #: Whether to kill this query on KeyboardInterrupt
    #:
    #: Initially set to :attr:`Athena.kill_on_interrupt`.
    kill_on_interrupt: bool = False

    _execution_id: str
    _proxy: AthenaProxy
    _cache: AthenaCache

    # Info of a finished query, memoized by get_info().
    # Stays None until the query is observed as finished.
    _info: QueryInfo | None = None

    def __init__(
        self,
        execution_id: str,
        *,
        proxy: AthenaProxy,
        cache: AthenaCache,
    ) -> None:
        self._execution_id = execution_id
        self._proxy = proxy
        self._cache = cache

    def __repr__(self) -> str:
        return f"<{type(self).__name__}: execution_id={self.execution_id!r}>"

    @property
    def execution_id(self) -> str:
        """
        Athena query execution ID.

        This ID can be used to retrieve this query later using
        the :meth:`.Athena.get_query()` method.
        """
        return self._execution_id

    def get_info(self) -> QueryInfo:
        """
        Retrieve information about this query execution.

        Returns a status of this query with other information.

        Once the query is reported as finished, its info is remembered
        on this instance and returned by subsequent calls
        without asking the proxy again.
        """
        # Query info is cached if the query finished and cannot change.
        if self._info is not None:
            return self._info
        info = self._proxy.get_query_execution(self._execution_id)
        if info.finished:
            self._info = info
        return info

    def get_results(self) -> QueryResults:
        """
        Download results of this query execution.

        Cached results can be returned, if the caching was configured.
        Only SELECT queries are cached.

        Waits until this query execution finishes and downloads results.
        Raises :class:`.AthenaQueryError` if the query failed.
        """
        # When a user calls athena.get_query(execution_id).get_results(),
        # we have to look into the cache without knowing what SQL was executed,
        # and therefore without knowing whether the query is cacheable.
        results = self._cache.load_results(self._execution_id)
        if results is not None:
            logger.info(
                f"Query {self._execution_id!r} results loaded from cache:"
                f" {len(results)} rows"
            )
            return results
        # Nothing in the cache: wait for the query and download its results.
        info = self._join_proxy()
        results = self._proxy.get_query_results(info)
        # The SQL is known only now (from the query info),
        # so this is the first point where we can decide about caching.
        should_cache = is_select(info.sql)
        if should_cache:
            self._cache.save_results(self._execution_id, results)
        logger.info(
            f"Query {self._execution_id!r} results retrieved: {len(results)} rows"
        )
        return results

    def kill(self) -> None:
        """
        Kill this query execution.

        This is a non-blocking operation.
        It does not wait until the query is killed.
        """
        self._proxy.stop_query_execution(self._execution_id)

    def join(self) -> None:
        """
        Wait until this query execution finishes.

        Raises :class:`.AthenaQueryError` if the query failed.
        """
        # When we have results locally, exit without calling any AWS API.
        if self._cache.has_results(self._execution_id):
            return
        self._join_proxy()

    def _join_proxy(self) -> QueryInfo:
        """
        Poll the proxy until this query finishes and return its info.

        Sleeps between polls for the delays produced by :attr:`backoff`.

        When the sleep is interrupted by KeyboardInterrupt
        and :attr:`kill_on_interrupt` is set, the query is killed
        and the method keeps waiting for it to finish.
        Otherwise, the KeyboardInterrupt is re-raised.
        """
        # NOTE(review): next() raises StopIteration if a finite backoff
        # iterable is exhausted - confirm that backoff is always infinite.
        backoff = iter(self.backoff)
        while True:
            info = self.get_info()
            if info.finished:
                logger.info(f"Query {self._execution_id!r} finished: {info}")
                # Presumably raises for queries that did not succeed
                # (see the "Raises" notes of join() and get_results()).
                info.check()
                return info
            try:
                time.sleep(next(backoff))
            except KeyboardInterrupt:
                if not self.kill_on_interrupt:
                    raise
                # Catch only the first KeyboardInterrupt:
                # with the flag cleared, a second interrupt received
                # while waiting for the kill propagates to the caller.
                self.kill_on_interrupt = False
                self.kill()
                # Wait for the killed query to finish. If join() returns
                # normally, the loop re-reads the info and returns it.
                self.join()


class Athena:
    """
    Athena client.

    Provides methods to execute SQL queries in AWS Athena,
    with an optional caching and other helpers.

    Can be used as a blocking or a non-blocking client.

    Use :func:`.setup` or :func:`.environ_setup` to construct
    this class without touching Pallas internals.

    :param proxy: an internal proxy to execute queries
    """

    quote = staticmethod(quote)

    #: Name of Athena database to be queried.
    #:
    #: Can be overridden in SQL.
    database: str | None = None

    #: Name of Athena workgroup.
    #:
    #: Workgroup can set resource limits or override output location.
    #: When None, defaults to the Athena default workgroup.
    workgroup: str | None = None

    #: URI of output location on S3.
    #:
    #: Optional if an output location is specified for :attr:`.workgroup`.
    output_location: str | None = None

    #: Whether to normalize queries before execution.
    normalize: bool = True

    #: Whether to kill queries on KeyboardInterrupt
    kill_on_interrupt: bool = True

    _proxy: AthenaProxy
    _cache: AthenaCache

    def __init__(self, proxy: AthenaProxy) -> None:
        self._proxy = proxy
        # Every client starts with its own cache object;
        # it can be reconfigured via the cache property or using().
        self._cache = AthenaCache()

    def __repr__(self) -> str:
        parts = [
            f"database={self.database!r}",
            f"workgroup={self.workgroup!r}",
            f"output_location={self.output_location!r}",
        ]
        return f"<{type(self).__name__}: {', '.join(parts)}>"

    @property
    def proxy(self) -> AthenaProxy:
        """
        The internal proxy that this client uses to execute queries.
        """
        return self._proxy

    @property
    def cache(self) -> AthenaCache:
        """
        Cache implementation.

        It is possible to update properties of the :attr:`.cache`
        attribute to reconfigure caching in place.

        Alternatively, the :meth:`.using` method can apply
        a new configuration without affecting an existing instance.

        :rtype: :class:`.AthenaCache`
        """
        return self._cache

    def using(
        self,
        *,
        database: str | None = None,
        workgroup: str | None = None,
        output_location: str | None = None,
        normalize: bool | None = None,
        kill_on_interrupt: bool | None = None,
        cache_enabled: bool | None = None,
        cache_read: bool | None = None,
        cache_write: bool | None = None,
        cache_failed: bool | None = None,
    ) -> Athena:
        """
        Create a new instance with an updated configuration.

        This method can be useful if you need to override a configuration
        for one query, but you do not want to affect future queries.

        Arguments left as None keep the value of this instance.

        :param database: name of Athena database to be queried.
        :param workgroup: name of Athena workgroup.
        :param output_location: URI of output location on S3.
        :param normalize: whether to normalize queries before execution.
        :param kill_on_interrupt: whether to kill queries on KeyboardInterrupt
        :param cache_enabled: whether a cache should be used.
        :param cache_read: whether a cache should be read.
        :param cache_write: whether a cache should be written.
        :param cache_failed: whether to return failed queries found in cache.
        :return: an updated copy of this client
        """
        # Shallow copies: the proxy stays shared with this instance,
        # but the cache object is copied separately so that the cache_*
        # overrides below do not leak back to this instance.
        other = copy.copy(self)
        other._cache = copy.copy(self._cache)
        if database is not None:
            other.database = database
        if workgroup is not None:
            other.workgroup = workgroup
        if output_location is not None:
            other.output_location = output_location
        if normalize is not None:
            other.normalize = normalize
        if kill_on_interrupt is not None:
            other.kill_on_interrupt = kill_on_interrupt
        if cache_enabled is not None:
            other._cache.enabled = cache_enabled
        if cache_read is not None:
            other._cache.read = cache_read
        if cache_write is not None:
            other._cache.write = cache_write
        if cache_failed is not None:
            other._cache.failed = cache_failed
        return other

    def execute(
        self,
        operation: str,
        parameters: PARAMETERS = None,
    ) -> QueryResults:
        """
        Execute a query and return results.

        This is a blocking method that waits until the query finishes.

        Cached results or results from an existing query can be returned,
        if the caching was configured. Only SELECT queries are cached.

        Raises :class:`.AthenaQueryError` if the query fails.

        :param operation: an SQL query to be executed
            Can contain ``%s`` or ``%(key)s`` placeholders for substitution
            by *parameters*.
        :param parameters: parameters to substitute in *operation*.
            All substitute parameters are quoted appropriately.
            See the :meth:`.quote` method for supported parameter types.
        :type parameters: Union[None, Tuple[SQL_SCALAR, ...], Mapping[str, SQL_SCALAR]]
        :return: query results
        """
        return self.submit(operation, parameters).get_results()

    def submit(
        self,
        operation: str,
        parameters: PARAMETERS = None,
    ) -> Query:
        """
        Submit a query and return.

        This is a non-blocking method that starts a query and returns.
        Returns a :class:`Query` instance for monitoring query execution
        and downloading results later.

        An existing query can be returned, if the caching was configured.
        Only SELECT queries are cached.

        :param operation: an SQL query to be executed
            Can contain ``%s`` or ``%(key)s`` placeholders for substitution
            by *parameters*.
        :param parameters: parameters to substitute in *operation*.
            All substitute parameters are quoted appropriately.
            See the :meth:`.quote` method for supported parameter types.
        :type parameters: Union[None, Tuple[SQL_SCALAR, ...], Mapping[str, SQL_SCALAR]]
        :return: a query instance
        """
        sql = self._get_sql(operation, parameters)
        # Reuse a previous execution of the same SQL when the cache has one.
        query = self._get_cached_query(sql)
        if query:
            logger.info(f"Query {query.execution_id!r} found in cache.")
            return query
        execution_id = self._proxy.start_query_execution(
            sql,
            database=self.database,
            workgroup=self.workgroup,
            output_location=self.output_location,
        )
        # Remember the execution ID so that a later submission
        # of the same SELECT can find this execution.
        self._set_cached_query(sql, execution_id)
        query = self.get_query(execution_id)
        logger.info(f"Query {query.execution_id!r} submitted.")
        return query

    def get_query(self, execution_id: str) -> Query:
        """
        Get a previously submitted query execution.

        This method can be used to retrieve a query executed in the past.
        Because Athena stores results in S3 and does not delete them by default,
        it is possible to download results until they are manually deleted.

        :param execution_id: an Athena query execution ID.
        :return: a query instance
        """
        query = Query(execution_id, proxy=self._proxy, cache=self._cache)
        # The query inherits the interrupt behavior configured on this client.
        query.kill_on_interrupt = self.kill_on_interrupt
        return query

    def _get_sql(self, operation: str, parameters: PARAMETERS) -> str:
        """
        Build the final SQL from *operation* and *parameters*.

        Substitutes the parameters and, when :attr:`normalize` is set,
        normalizes the resulting SQL.
        """
        sql = substitute_parameters(operation, parameters)
        if self.normalize:
            sql = normalize_sql(sql)
        return sql

    def _get_cached_query(self, sql: str) -> Query | None:
        """
        Return a reusable query previously submitted for *sql*, or None.

        Only SELECT queries are looked up.
        The cache is asked for the pair of the current database and *sql*.
        """
        if not is_select(sql):
            return None
        execution_id = self._cache.load_execution_id(self.database, sql)
        if execution_id is None:
            return None
        query = self.get_query(execution_id)
        if not self._cached_query_valid(query):
            return None
        return query

    def _set_cached_query(self, sql: str, execution_id: str) -> None:
        """
        Remember *execution_id* for the current database and *sql*.

        Does nothing for queries that are not SELECT.
        """
        if not is_select(sql):
            return
        self._cache.save_execution_id(self.database, sql, execution_id)

    def _cached_query_valid(self, query: Query) -> bool:
        """
        Return True if this query succeeded or can still succeed.

        Cancelled queries are never valid. Failed queries are valid
        only if the cache is configured to return failed queries.
        """
        # Results available locally: no need to ask for the query state.
        if self._cache.has_results(query.execution_id):
            return True
        info = query.get_info()
        if info.state == "CANCELLED":
            return False
        if info.state == "FAILED" and not self.cache.failed:
            return False
        return True
# Pallas
#
# Navigation
#
# Related Topics
#
# Source code for pallas.client