Source code for sparkly.testing

#
# Copyright 2017 Tubular Labs, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import collections
import contextlib
import copy
import difflib
from functools import partial, total_ordering
import importlib
import json
import logging
import math
import operator
import os
import pprint
import shutil
import sys
import tempfile
from unittest import TestCase
from unittest.util import safe_repr
import warnings

from pyspark.sql import types as T
import six

from sparkly import SparklySession
from sparkly.exceptions import FixtureError
from sparkly.utils import kafka_get_topics_offsets

if sys.version_info.major == 3:
    from http.client import HTTPConnection
else:
    from httplib import HTTPConnection

try:
    from cassandra.cluster import Cluster
    CASSANDRA_FIXTURES_SUPPORT = True
except ImportError:
    CASSANDRA_FIXTURES_SUPPORT = False

try:
    import pymysql as connector
    MYSQL_FIXTURES_SUPPORT = True
except ImportError:
    try:
        import mysql.connector as connector
        MYSQL_FIXTURES_SUPPORT = True
    except ImportError:
        MYSQL_FIXTURES_SUPPORT = False

try:
    from kafka import KafkaProducer, SimpleClient
    KAFKA_FIXTURES_SUPPORT = True
except ImportError:
    KAFKA_FIXTURES_SUPPORT = False


logger = logging.getLogger()


_test_session_cache = None


class SparklyTest(TestCase):
    """Base test for spark script tests.

    Initializes and shuts down the session specified by the `session` attribute.

    Example:

        >>> from pyspark.sql import types as T
        >>> class MyTestCase(SparklyTest):
        ...     def test(self):
        ...         self.assertRowsEqual(
        ...             self.spark.sql('SELECT 1 as one').collect(),
        ...             [T.Row(one=1)],
        ...         )
    """
    session = SparklySession

    class_fixtures = []
    fixtures = []
    maxDiff = None

    # (str|None) import the function/class to be tested programmatically
    # by specifying the path to it here, e.g. 'module_a.submodule_b.my_func'
    test_target = None

    @classmethod
    def setup_session(cls):
        return cls.session({
            # Use in-memory hive metastore (faster tests).
            'spark.hadoop.javax.jdo.option.ConnectionURL':
                'jdbc:derby:memory:databaseName=metastore_db;create=true',
            'spark.hadoop.javax.jdo.option.ConnectionDriverName':
                'org.apache.derby.jdbc.EmbeddedDriver',

            # Isolate the warehouse inside of a random temporary directory (no side effects).
            'spark.sql.warehouse.dir': tempfile.mkdtemp(suffix='sparkly'),

            # Reduce number of shuffle partitions (faster tests).
            'spark.sql.shuffle.partitions': '4',
        })

    @classmethod
    def _init_session(cls):
        # In case the project has a mix of SparklyTest and SparklyGlobalSessionTest-based tests
        global _test_session_cache

        if _test_session_cache:
            logger.info('Found a global session, stopping it %r', _test_session_cache)
            _test_session_cache.stop()
            _test_session_cache = None

        cls.spark = cls.setup_session()

    @classmethod
    def setUpClass(cls):
        super(SparklyTest, cls).setUpClass()

        cls._init_session()

        for fixture in cls.class_fixtures:
            fixture.setup_data()

        # HACK: When pyspark.sql.functions.udf is used as a decorator
        # it is evaluated at import time; this has the side effect of
        # creating a spark context if one doesn't exist, messing up
        # with this class creating its own for test purposes. As a
        # result, any transformations to be tested are imported here
        # programmatically after the test class initialization if
        # the user wishes.
        if not cls.test_target:
            return

        test_module_path, test_target = cls.test_target.rsplit('.', 1)
        test_module = importlib.import_module(test_module_path)
        setattr(sys.modules[cls.__module__], test_target, getattr(test_module, test_target))

    @classmethod
    def tearDownClass(cls):
        cls.spark.stop()
        super(SparklyTest, cls).tearDownClass()

        for fixture in cls.class_fixtures:
            fixture.teardown_data()

    def setUp(self):
        for fixture in self.fixtures:
            fixture.setup_data()

    def tearDown(self):
        for fixture in self.fixtures:
            fixture.teardown_data()

    def assertDataFrameEqual(self, actual_df, expected_data, fields=None, ordered=False):
        """Ensure that DataFrame has the right data inside.

        ``assertDataFrameEqual`` is being deprecated. Please use
        ``assertRowsEqual`` instead.

        Args:
            actual_df (pyspark.sql.DataFrame|list[pyspark.sql.Row]): Dataframe to test data in.
            expected_data (list[dict]): Expected dataframe rows defined as dicts.
            fields (list[str]): Compare only certain fields.
            ordered (bool): Does order of rows matter?
        """
        warnings.warn(
            'assertDataFrameEqual is being deprecated. Please use assertRowsEqual instead.',
            DeprecationWarning,
        )

        if fields:
            actual_df = actual_df.select(*fields)

        actual_rows = actual_df.collect() if hasattr(actual_df, 'collect') else actual_df
        actual_data = [row.asDict(recursive=True) for row in actual_rows]

        return self.assertRowsEqual(
            actual_data,
            expected_data,
            ignore_order=not ordered,
            ignore_order_depth=1,
            atol=0,
            rtol=0,
            equal_nan=False,
            ignore_nullability=False,
        )

    def assertRowsEqual(self, first, second, msg=None,
                        # ordering parameters
                        ignore_order=True, ignore_order_depth=None,
                        # float comparison parameters
                        atol=0, rtol=1e-07, equal_nan=True,
                        # DataType comparison parameters
                        ignore_nullability=True):
        """Assert equal on steroids.

        Extends the classic ``assertEqual`` signature to work better with
        comparisons involving rows, datatypes, dictionaries, lists and floats by:

        - ignoring the order of lists and datatypes recursively,
        - comparing floats within a given tolerance,
        - assuming NaNs are equal,
        - ignoring the nullability requirements of datatypes (since Spark
          can be inaccurate when inferring it),
        - providing better diffs for rows and datatypes.

        Float comparisons are inspired by NumPy's ``assert_allclose``. The
        main formula used is ``| float1 - float2 | <= atol + rtol * float2``.

        Args:
            first: see ``unittest.TestCase.assertEqual``.
            second: see ``unittest.TestCase.assertEqual``.
            msg: see ``unittest.TestCase.assertEqual``.
            ignore_order (bool|True): ignore the order in lists and
                datatypes (rows, dicts are inherently orderless).
            ignore_order_depth (int|None): if ignore_order is true,
                ignore order up to this level of nested lists or
                datatypes (exclusive). Setting this to 0 or None means
                ignore order infinitely, 1 means ignore order only at
                the top level, 2 will ignore order within lists of
                lists and so on. Default is ignore order arbitrarily deep.
            atol (int, float|0): Absolute tolerance in float comparisons.
            rtol (int, float|1e-07): Relative tolerance in float comparisons.
            equal_nan (bool|True): If set, NaNs will compare equal.
            ignore_nullability (bool|True): If set, ignore all nullability
                fields in dataTypes. This includes ``containsNull`` in
                arrays, ``valueContainsNull`` in maps and ``nullable`` in
                struct fields.

        Returns:
            None iff the two objects are equal.

        Raises:
            AssertionError: iff the two objects are not equal. See
                ``unittest.TestCase.assertEqual`` for details.
        """
        # Our approach here is to redefine the 5 container objects that
        # this function expects to work with - floats, dataTypes, rows,
        # dicts and lists - to introduce generic ordering and extend
        # the meaning of equality where applicable. We can then change
        # all such objects to our custom containers, provide new asserters
        # for some, and then feed them to the vanilla assertEqual.
        # Define our custom containers
        def cast_to_test_friendly_container(value, ignore_order_depth):
            if isinstance(value, float):
                return Float(value)
            if isinstance(value, T.DataType):
                return DataType(value, ignore_order_depth)
            if isinstance(value, T.Row):
                return Row(value, ignore_order_depth)
            if isinstance(value, dict):
                return Dict(value, ignore_order_depth)
            if isinstance(value, list):
                return List(value, ignore_order_depth)
            return value

        @total_ordering
        class Float(object):
            def __init__(self, f):
                self._f = f

            def __eq__(self, other):
                return other is not None and (
                    (equal_nan and math.isnan(self._f) and math.isnan(other._f)) or
                    abs(self._f - other._f) <= atol + rtol * abs(other._f)
                )

            def __lt__(self, other):
                return self._f != other._f and self._f < other._f

            def __repr__(self):
                return repr(self._f)

        @total_ordering
        class DataType(object):
            def __init__(self, dt, ignore_order_depth=0):
                # update recursively all T.StructTypes to define their
                # fields in sorted order
                def _sort_structs(dt, ignore_order_depth):
                    if ignore_order_depth == 0:
                        return dt
                    if dt.typeName() == 'array':
                        return T.ArrayType(
                            elementType=_sort_structs(dt.elementType, ignore_order_depth),
                            containsNull=ignore_nullability or dt.containsNull,
                        )
                    if dt.typeName() == 'map':
                        return T.MapType(
                            keyType=_sort_structs(dt.keyType, ignore_order_depth),
                            valueType=_sort_structs(dt.valueType, ignore_order_depth),
                            valueContainsNull=ignore_nullability or dt.valueContainsNull,
                        )
                    if dt.typeName() == 'struct':
                        return T.StructType([
                            _sort_structs(f, ignore_order_depth - 1)
                            for f in sorted(dt.fields, key=lambda f: f.name)
                        ])
                    if dt.typeName() == 'structf':
                        return T.StructField(
                            dt.name,
                            _sort_structs(dt.dataType, ignore_order_depth),
                            nullable=ignore_nullability or dt.nullable,
                            metadata=dt.metadata,
                        )
                    return dt

                self._dt = _sort_structs(dt, ignore_order_depth)

            def __eq__(self, other):
                return self._dt == other._dt

            def __ne__(self, other):
                # Only needed for Py27...
                return self._dt != other._dt

            def __lt__(self, other):
                return other is not None and repr(self._dt) < repr(other._dt)

            def __repr__(self):
                return repr(self._dt)

            def pretty_repr(self):
                # useful to get a nice diff later
                return pprint.pformat(self._dt.jsonValue()).splitlines()

        @total_ordering
        class Row(collections.OrderedDict):
            def __init__(self, row, ignore_order_depth=0):
                super(Row, self).__init__(
                    (field, cast_to_test_friendly_container(row[field], ignore_order_depth))
                    # Rows currently store their fields in order either
                    # way but we ensure this is the case here too
                    for field in sorted(row.__fields__)
                )

            def __lt__(self, other):
                return other is not None and (
                    List(zip(self.keys(), self.values())) <
                    List(zip(other.keys(), other.values()))
                )

            def __repr__(self):
                return 'Row({})'.format(
                    ', '.join(['{!r}={!r}'.format(*i) for i in self.items()])
                )

        @total_ordering
        class Dict(collections.OrderedDict):
            def __init__(self, dictionary, ignore_order_depth=0):
                super(Dict, self).__init__(
                    sorted([
                        (
                            cast_to_test_friendly_container(k, ignore_order_depth),
                            cast_to_test_friendly_container(v, ignore_order_depth),
                        )
                        for k, v in dictionary.items()
                    ])
                )

            def __lt__(self, other):
                return other is not None and (
                    List(zip(self.keys(), self.values())) <
                    List(zip(other.keys(), other.values()))
                )

            def __repr__(self):
                return '{{{}}}'.format(
                    ', '.join(['{!r}: {!r}'.format(*i) for i in self.items()])
                )

        @total_ordering
        class List(list):
            def __init__(self, sequence, ignore_order_depth=0):
                if ignore_order_depth == 0:
                    _sort_or_pass = lambda l: l
                else:
                    ignore_order_depth = ignore_order_depth - 1
                    _sort_or_pass = sorted

                super(List, self).__init__(
                    _sort_or_pass([
                        cast_to_test_friendly_container(v, ignore_order_depth)
                        for v in sequence
                    ])
                )

            def __lt__(self, other):
                # None is not a nice value to compare to when trying to
                # order things, as TypeErrors are raised. Instead, we
                # transform it to a tuple - if the first entry doesn't
                # match (is it None?) we don't need to compare further
                def _neutralize_none(entry):
                    if isinstance(entry, tuple):
                        return entry is None, tuple(_neutralize_none(e) for e in entry)
                    return entry is None, entry

                return [_neutralize_none(e) for e in self] < [_neutralize_none(e) for e in other]

        # Define new equality asserters for floats, rows and datatypes
        def assert_float_equal(self, float1, float2, msg=None):
            if not isinstance(float1, Float):
                float1 = Float(float1)
            if not isinstance(float2, Float):
                float2 = Float(float2)

            if float1 == float2:
                return

            if not atol and not rtol:
                standard_msg = '{} != {}'.format(float1, float2)
            else:
                standard_msg = (
                    '{} != {} within absolute tolerance {} and relative tolerance {}'
                    .format(float1, float2, atol, rtol)
                )

            self.fail(self._formatMessage(msg, standard_msg))

        def assert_row_equal(self, row1, row2, msg=None):
            self.assertEqual(Row(row1), Row(row2), msg)

        def assert_datatype_equal(self, dt1, dt2, msg=None):
            if not isinstance(dt1, DataType):
                dt1 = DataType(dt1)
            if not isinstance(dt2, DataType):
                dt2 = DataType(dt2)

            if dt1 != dt2:
                standard_msg = '{} != {}'.format(safe_repr(dt1, True), safe_repr(dt2, True))
                diff = '\n' + '\n'.join(difflib.ndiff(dt1.pretty_repr(), dt2.pretty_repr()))
                standard_msg = self._truncateMessage(standard_msg, diff)
                self.fail(self._formatMessage(msg, standard_msg))

        # Create a context manager to temporarily register our asserters,
        # then restore them to defaults after this function is finished
        # since they might depend on specific parameters provided here
        # (e.g., atol/rtol for floats)
        @contextlib.contextmanager
        def temp_add_type_equality_func(self, typeobj, function):
            old_asserter = self._type_equality_funcs.get(typeobj)
            self.addTypeEqualityFunc(typeobj, function)
            yield
            self.addTypeEqualityFunc(typeobj, old_asserter)

        temp_add_type_equality_func = partial(temp_add_type_equality_func, self)

        # Register equality asserters
        with temp_add_type_equality_func(float, partial(assert_float_equal, self)), \
                temp_add_type_equality_func(Float, partial(assert_float_equal, self)), \
                temp_add_type_equality_func(DataType, partial(assert_datatype_equal, self)), \
                temp_add_type_equality_func(T.DataType, partial(assert_datatype_equal, self)), \
                temp_add_type_equality_func(Row, self.assertDictEqual), \
                temp_add_type_equality_func(T.Row, partial(assert_row_equal, self)), \
                temp_add_type_equality_func(Dict, self.assertDictEqual), \
                temp_add_type_equality_func(List, self.assertListEqual):

            # And finally (phew!) run the actual comparisons
            ignore_order_depth = ignore_order_depth or -1 if ignore_order else 0
            first = cast_to_test_friendly_container(first, ignore_order_depth)
            second = cast_to_test_friendly_container(second, ignore_order_depth)
            self.assertEqual(first, second, msg)
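

# Illustrative sketch (not part of the sparkly module): one way the ordering
# and float-tolerance knobs of ``assertRowsEqual`` can be combined. The test
# class name and the data below are hypothetical.
class MyRowComparisonTest(SparklyTest):
    def test_rows_close_enough(self):
        # Row order is ignored by default and floats compare within
        # atol + rtol * |expected|.
        self.assertRowsEqual(
            [T.Row(id=2, score=0.30000001), T.Row(id=1, score=0.1)],
            [T.Row(id=1, score=0.1), T.Row(id=2, score=0.3)],
            atol=1e-6,
        )

    def test_rows_strict(self):
        # Order matters here and no float/NaN leniency is applied.
        self.assertRowsEqual(
            self.spark.sql('SELECT 1 AS one').collect(),
            [T.Row(one=1)],
            ignore_order=False,
            rtol=0,
            equal_nan=False,
        )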

class SparklyGlobalSessionTest(SparklyTest):
    """Base test case that keeps a single instance of the given session class across all tests.

    Integration tests are slow, especially when you have to start/stop
    the Spark context for each test case. This class allows you to reuse
    the Spark session across multiple test cases.
    """
    @classmethod
    def _init_session(cls):
        global _test_session_cache

        if _test_session_cache and cls.session == type(_test_session_cache):
            logger.info('Reusing the global session for %r', cls.session)
            spark = _test_session_cache
        else:
            if _test_session_cache:
                logger.info('Stopping the previous global session %r', _test_session_cache)
                _test_session_cache.stop()

            logger.info('Starting the new global session for %r', cls.session)
            spark = _test_session_cache = cls.setup_session()

        cls.spark = spark

    @classmethod
    def tearDownClass(cls):
        cls.spark.catalog.clearCache()

        for fixture in cls.class_fixtures:
            fixture.teardown_data()
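

# Illustrative sketch (not part of the sparkly module): several test cases
# sharing one global session by pointing at the same SparklySession subclass.
# ``MyProjectSession`` and the test bodies are hypothetical.
class MyProjectSession(SparklySession):
    pass


class FirstIntegrationTest(SparklyGlobalSessionTest):
    session = MyProjectSession

    def test_select(self):
        self.assertRowsEqual(
            self.spark.sql('SELECT 1 AS one').collect(),
            [T.Row(one=1)],
        )


class SecondIntegrationTest(SparklyGlobalSessionTest):
    # Reuses the session started for FirstIntegrationTest because the
    # `session` class matches the cached global session.
    session = MyProjectSession

    def test_count(self):
        self.assertEqual(self.spark.range(3).count(), 3)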

class Fixture(object):
    """Base class for fixtures.

    Fixture is a term borrowed from Django tests: it's data loaded into
    the database for integration testing.
    """
    def setup_data(self):
        """Method called to load data into the database."""
        raise NotImplementedError()

    def teardown_data(self):
        """Method called to remove data from the database that was loaded by `setup_data`."""
        raise NotImplementedError()

    def __enter__(self):
        self.setup_data()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.teardown_data()

    @classmethod
    def read_file(cls, path):
        with open(path) as f:
            data = f.read()
        return data
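

# Illustrative sketch (not part of the sparkly module): a minimal custom
# Fixture that loads rows into an in-memory store. ``FakeStore`` and
# ``InMemoryFixture`` are hypothetical; real fixtures would talk to an
# external service instead.
class FakeStore(object):
    rows = []


class InMemoryFixture(Fixture):
    def __init__(self, data_file):
        self.data_file = data_file

    def setup_data(self):
        # `read_file` is inherited from Fixture.
        FakeStore.rows = json.loads(self.read_file(self.data_file))

    def teardown_data(self):
        FakeStore.rows = []

# A fixture can be attached to a test case via the `fixtures` attribute or
# used as a context manager (setup_data on enter, teardown_data on exit):
#
#     with InMemoryFixture('/path/to/data.json'):
#         ...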

class CassandraFixture(Fixture):
    """Fixture to load data into Cassandra.

    Notes:
        * Depends on cassandra-driver.

    Examples:

        >>> class MyTestCase(SparklyTest):
        ...     fixtures = [
        ...         CassandraFixture(
        ...             'cassandra.host',
        ...             absolute_path(__file__, 'resources', 'setup.cql'),
        ...             absolute_path(__file__, 'resources', 'teardown.cql'),
        ...         )
        ...     ]
        ...

        >>> class MyTestCase(SparklyTest):
        ...     data = CassandraFixture(
        ...         'cassandra.host',
        ...         absolute_path(__file__, 'resources', 'setup.cql'),
        ...         absolute_path(__file__, 'resources', 'teardown.cql'),
        ...     )
        ...     def setUp(self):
        ...         data.setup_data()
        ...     def tearDown(self):
        ...         data.teardown_data()
        ...

        >>> def test():
        ...     fixture = CassandraFixture(...)
        ...     with fixture:
        ...         test_stuff()
        ...
    """
    def __init__(self, host, setup_file, teardown_file):
        if not CASSANDRA_FIXTURES_SUPPORT:
            raise NotImplementedError('cassandra-driver package isn\'t available. '
                                      'Use pip install sparkly[test] to fix it.')

        self.host = host
        self.setup_file = setup_file
        self.teardown_file = teardown_file

    def _execute(self, statements):
        cluster = Cluster([self.host])
        session = cluster.connect()
        for statement in statements.split(';'):
            if bool(statement.strip()):
                session.execute(statement.strip())

    def setup_data(self):
        self._execute(self.read_file(self.setup_file))

    def teardown_data(self):
        self._execute(self.read_file(self.teardown_file))

class ElasticFixture(Fixture):
    """Fixture for elastic integration tests.

    Examples:

        >>> class MyTestCase(SparklyTest):
        ...     fixtures = [
        ...         ElasticFixture(
        ...             'elastic.host',
        ...             'es_index',
        ...             'es_type',
        ...             '/path/to/mapping.json',
        ...             '/path/to/data.json',
        ...         )
        ...     ]
        ...
    """
    def __init__(self, host, es_index, es_type, mapping=None, data=None, port=None):
        self.host = host
        self.port = port or 9200
        self.es_index = es_index
        self.es_type = es_type
        self.mapping = mapping
        self.data = data

    def setup_data(self):
        if self.mapping:
            self._request(
                'PUT',
                '/{}'.format(self.es_index),
                json.dumps({
                    'settings': {
                        'index': {
                            'number_of_shards': 1,
                            'number_of_replicas': 1,
                        }
                    }
                }),
            )
            self._request(
                'PUT',
                '/{}/_mapping/{}'.format(self.es_index, self.es_type),
                self.read_file(self.mapping),
            )

        if self.data:
            self._request(
                'POST',
                '/_bulk',
                self.read_file(self.data),
            )
            self._request(
                'POST',
                '/_refresh',
            )

    def teardown_data(self):
        self._request(
            'DELETE',
            '/{}'.format(self.es_index),
        )

    def _request(self, method, url, body=None):
        connection = HTTPConnection(self.host, port=self.port)
        connection.request(method, url, body)
        response = connection.getresponse()
        if sys.version_info.major == 3:
            code = response.code
        else:
            code = response.status
        if code != 200:
            raise FixtureError('{}: {}'.format(code, response.read()))

class MysqlFixture(Fixture):
    """Fixture for MySQL integration tests.

    Notes:
        * Depends on the PyMySQL lib.

    Examples:

        >>> class MyTestCase(SparklyTest):
        ...     fixtures = [
        ...         MysqlFixture('mysql.host', 'user', 'password', '/path/to/data.sql')
        ...     ]
        ...     def test(self):
        ...         pass
        ...
    """
    def __init__(self, host, user, password=None, data=None, teardown=None):
        if not MYSQL_FIXTURES_SUPPORT:
            raise NotImplementedError('PyMySQL package isn\'t available. '
                                      'Use pip install sparkly[test] to fix it.')

        self.host = host
        self.user = user
        self.password = password
        self.data = data
        self.teardown = teardown

    def _execute(self, statements):
        ctx = connector.connect(
            user=self.user,
            password=self.password,
            host=self.host,
        )
        cursor = ctx.cursor()
        cursor.execute(statements)
        ctx.commit()
        cursor.close()
        ctx.close()

    def setup_data(self):
        self._execute(self.read_file(self.data))

    def teardown_data(self):
        self._execute(self.read_file(self.teardown))
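

# Illustrative sketch (not part of the sparkly module): pairing setup and
# teardown SQL files so the loaded data is removed after every test. The
# class name and file paths are hypothetical.
class MyMysqlTest(SparklyTest):
    fixtures = [
        MysqlFixture(
            'mysql.host',
            'user',
            password='password',
            data='/path/to/setup.sql',
            teardown='/path/to/teardown.sql',
        )
    ]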

class KafkaFixture(Fixture):
    """Fixture for Kafka integration tests.

    Notes:
        * Depends on the kafka-python lib.
        * The data file should contain one JSON dict per line: {'key': ..., 'value': ...}

    Examples:

        >>> class MyTestCase(SparklyTest):
        ...     fixtures = [
        ...         KafkaFixture(
        ...             'kafka.host', topic='topic',
        ...             key_serializer=..., value_serializer=...,
        ...             data='/path/to/data.json',
        ...         )
        ...     ]
    """
    def __init__(self, host, port=9092, topic=None,
                 key_serializer=None, value_serializer=None,
                 data=None):
        """Constructor.

        Args:
            host (str): Kafka host.
            port (int): Kafka port.
            topic (str): Kafka topic.
            key_serializer (function): Converts a python data structure to bytes,
                applied to the message key.
            value_serializer (function): Converts a python data structure to bytes,
                applied to the message value.
            data (str): Path to the json file with data.
        """
        if not KAFKA_FIXTURES_SUPPORT:
            raise NotImplementedError('kafka-python package isn\'t available. '
                                      'Use pip install sparkly[test] to fix it.')

        self.host = host
        self.port = port
        self.topic = topic
        self.key_serializer = key_serializer
        self.value_serializer = value_serializer
        self.data = data

    def _publish_data(self, data):
        producer = KafkaProducer(
            bootstrap_servers='{}:{}'.format(self.host, self.port),
            key_serializer=self.key_serializer,
            value_serializer=self.value_serializer,
        )
        for item in data:
            producer.send(self.topic, key=item['key'], value=item['value'])
        producer.flush()
        producer.close()

    def setup_data(self):
        data = [json.loads(item) for item in self.read_file(self.data).strip().split('\n')]
        self._publish_data(data)

    def teardown_data(self):
        pass
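

# Illustrative sketch (not part of the sparkly module): JSON serializers for
# KafkaFixture, which must turn python structures into bytes. The helper name,
# host, topic and data path are hypothetical.
def _to_json_bytes(obj):
    # kafka-python expects serializers to return bytes.
    return json.dumps(obj).encode('utf-8')


class MyKafkaTest(SparklyTest):
    fixtures = [
        KafkaFixture(
            'kafka.host',
            topic='my.topic',
            key_serializer=_to_json_bytes,
            value_serializer=_to_json_bytes,
            data='/path/to/data.json',
        )
    ]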

class KafkaWatcher:
    """Context manager that tracks Kafka data published to a topic.

    Provides access to the new items that were written to a kafka topic
    by code running within this context.

    NOTE: This is mainly useful in integration test cases and may produce
    unexpected results in production environments, since there are no
    guarantees about who else may be publishing to a kafka topic.

    Usage:

        my_deserializer = lambda item: json.loads(item.decode('utf-8'))
        kafka_watcher = KafkaWatcher(
            my_sparkly_session,
            expected_output_dataframe_schema,
            my_deserializer,
            my_deserializer,
            'my.kafkaserver.net',
            'my_kafka_topic',
        )
        with kafka_watcher:
            # do stuff that publishes messages to 'my_kafka_topic'

        self.assertEqual(kafka_watcher.count, expected_number_of_new_messages)
        self.assertDataFrameEqual(kafka_watcher.df, expected_df)
    """
    def __init__(
        self,
        spark,
        df_schema,
        key_deserializer,
        value_deserializer,
        host,
        topic,
        port=9092,
    ):
        """Initialize context manager.

        Parameters `key_deserializer` and `value_deserializer` are callables
        which get bytes as input and should return python structures as output.

        Args:
            spark (SparklySession): currently active SparklySession
            df_schema (pyspark.sql.types.StructType): schema of dataframe to be generated
            key_deserializer (function): function used to deserialize the key
            value_deserializer (function): function used to deserialize the value
            host (basestring): host or ip address of the kafka server to connect to
            topic (basestring): Kafka topic to monitor
            port (int): port number of the Kafka server to connect to
        """
        self.spark = spark
        self.topic = topic
        self.df_schema = df_schema
        self.key_deser, self.val_deser = key_deserializer, value_deserializer
        self.host, self.port = host, port
        self._df = None
        self.count = 0

        kafka_client = SimpleClient(host)
        kafka_client.ensure_topic_exists(topic)

    def __enter__(self):
        self._df = None
        self.count = 0
        self.pre_offsets = kafka_get_topics_offsets(
            topic=self.topic,
            host=self.host,
            port=self.port,
        )

    def __exit__(self, e_type, e_value, e_trace):
        self.post_offsets = kafka_get_topics_offsets(
            topic=self.topic,
            host=self.host,
            port=self.port,
        )
        self.count = sum([
            post[2] - pre[2]
            for pre, post in zip(self.pre_offsets, self.post_offsets)
        ])

    @property
    def df(self):
        if not self.count:
            return None
        if not self._df:
            offset_ranges = [
                [pre[0], pre[2], post[2]]
                for pre, post in zip(self.pre_offsets, self.post_offsets)
            ]
            self._df = self.spark.read_ext.kafka(
                topic=self.topic,
                offset_ranges=offset_ranges,
                schema=self.df_schema,
                key_deserializer=self.key_deser,
                value_deserializer=self.val_deser,
                host=self.host,
                port=self.port,
            )
        return self._df