# Source code for amazon.ion.reader_text

# Copyright 2016 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at:
#
#    http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
# OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the
# License.

# Python 2/3 compatibility
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import base64
from decimal import Decimal
from collections import defaultdict
from functools import partial

import six
import sys

from amazon.ion.core import Transition, ION_STREAM_INCOMPLETE_EVENT, ION_STREAM_END_EVENT, IonType, IonEvent, \
    IonEventType, IonThunkEvent, TimestampPrecision, timestamp, ION_VERSION_MARKER_EVENT
from amazon.ion.exceptions import IonException
from amazon.ion.reader import BufferQueue, reader_trampoline, ReadEventType, safe_unichr, CodePointArray, CodePoint, \
    _NARROW_BUILD
from amazon.ion.symbols import SymbolToken, TEXT_ION_1_0
from amazon.ion.util import record, coroutine, Enum, _next_code_point, CodePoint

# Short aliases used throughout this module.
# ``_ord`` is six.byte2int, which yields the integer value of the first byte of a bytes object on Python 2 and 3.
# ``_chr`` is the reader module's safe_unichr; it is used below to render a character ordinal in error messages.
_ord = six.byte2int
_chr = safe_unichr


def _illegal_character(c, ctx, message=''):
    """Reports an illegal character (or an illegal token) by raising an IonException.

    Args:
        c (int|None): Ordinal of the offending character, or None when the token as a whole is illegal.
        ctx (_HandlerContext): Context in which the problem was found. It supplies the queue position, the type of
            the value being read, the type of the enclosing container, and the pending value for the error text.
        message (Optional[str]): Additional information to include in the error text.

    Raises:
        IonException: Always.
    """
    enclosing_type = ctx.container.ion_type
    container_name = 'top-level' if enclosing_type is None else enclosing_type.name
    value_name = 'unknown' if ctx.ion_type is None else ctx.ion_type.name
    if c is None:
        header = 'Illegal token'
    elif BufferQueue.is_eof(c):
        header = 'Illegal character %s' % ('EOF',)
    else:
        header = 'Illegal character %s' % (_chr(c),)
    details = (header, ctx.queue.position, value_name, container_name, message, ctx.value)
    raise IonException('%s at position %d in %s value contained in %s. %s Pending value: %s' % details)


def _defaultdict(dct, fallback=_illegal_character):
    """Copies ``dct`` into a ``defaultdict`` whose missing keys map to ``fallback``.

    Looking up a key that is absent returns (and, per ``defaultdict`` semantics, stores) the ``fallback`` object
    itself, so callers can invoke the result of any lookup without checking for membership first.
    """
    wrapped = defaultdict(lambda: fallback)
    wrapped.update(dct)
    return wrapped


def _merge_mappings(*args):
    """Merges a sequence of dictionaries and/or tuples into a single dictionary.

    If a given argument is a tuple, it must have two elements, the first of which is an iterable of keys and the
    second of which is a single value, which will be mapped to from each of the keys.

    Arguments are applied in order, so a later argument overrides an earlier one for any key they share.

    Args:
        *args (dict|tuple): The mappings to merge.

    Returns:
        dict: A new dictionary containing every mapping from ``args``.
    """
    merged = {}
    for arg in args:
        if isinstance(arg, dict):
            merged.update(arg)
        else:
            assert isinstance(arg, tuple)
            keys, value = arg
            # dict.fromkeys maps every key to the same value without building an intermediate list, and works for
            # any iterable of keys (not only those that support len()).
            merged.update(dict.fromkeys(keys, value))
    return merged


def _seq(s):
    """Returns the contents of the given bytes as a tuple of integer ordinals, one per byte."""
    ordinals = six.iterbytes(s)
    return tuple(ordinals)


# Encoding used by _decode when turning a token's bytes into text.
_ENCODING = 'utf-8'

# NOTE: the following character classes are stored as tuples of integer ordinals (see _seq). This simplifies dealing
# with inconsistencies between how bytes objects are handled in Python 2 and 3, and simplifies logic around comparing
# multi-byte characters.
_WHITESPACE_NOT_NL = _seq(b' \t\v\f')
_WHITESPACE = _WHITESPACE_NOT_NL + _seq(b'\n\r')
_VALUE_TERMINATORS = _seq(b'{}[](),\"\' \t\n\r/')
_SYMBOL_TOKEN_TERMINATORS = _WHITESPACE + _seq(b'/:')
_DIGITS = _seq(b'0123456789')
_BINARY_RADIX = _seq(b'Bb')
_BINARY_DIGITS = _seq(b'01')
_HEX_RADIX = _seq(b'Xx')
_HEX_DIGITS = _DIGITS + _seq(b'abcdefABCDEF')
_DECIMAL_EXPS = _seq(b'Dd')
_FLOAT_EXPS = _seq(b'Ee')
_SIGN = _seq(b'+-')
_TIMESTAMP_YEAR_DELIMITERS = _seq(b'-T')
_TIMESTAMP_DELIMITERS = _seq(b'-:+.')
_TIMESTAMP_OFFSET_INDICATORS = _seq(b'Z+-')
_LETTERS = _seq(b'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
_BASE64_DIGITS = _LETTERS + _DIGITS + _seq(b'+/')
_IDENTIFIER_STARTS = _LETTERS + _seq(b'_')  # Note: '$' is dealt with separately.
_IDENTIFIER_CHARACTERS = _IDENTIFIER_STARTS + _DIGITS + _seq(b'$')
_OPERATORS = _seq(b'!#%&*+-./;<=>?@^`|~')
_COMMON_ESCAPES = _seq(b'abtnfrv?0\'"/\\')
_NEWLINES = _seq(b'\r\n')

# Ordinals of individual characters that the handlers compare against directly.
_UNDERSCORE = _ord(b'_')
_DOT = _ord(b'.')
_COMMA = _ord(b',')
_COLON = _ord(b':')
_SLASH = _ord(b'/')
_ASTERISK = _ord(b'*')
_BACKSLASH = _ord(b'\\')
_CARRIAGE_RETURN = _ord(b'\r')
_NEWLINE = _ord(b'\n')
_DOUBLE_QUOTE = _ord(b'"')
_SINGLE_QUOTE = _ord(b'\'')
_DOLLAR_SIGN = _ord(b'$')
_PLUS = _ord(b'+')
_MINUS = _ord(b'-')
_HYPHEN = _MINUS  # Same ordinal as _MINUS; a second name for the same character.
_T = _ord(b'T')
_Z = _ord(b'Z')
_T_LOWER = _ord(b't')
_N_LOWER = _ord(b'n')
_F_LOWER = _ord(b'f')
_ZERO = _DIGITS[0]
_OPEN_BRACE = _ord(b'{')
_OPEN_BRACKET = _ord(b'[')
_OPEN_PAREN = _ord(b'(')
_CLOSE_BRACE = _ord(b'}')
_CLOSE_BRACKET = _ord(b']')
_CLOSE_PAREN = _ord(b')')
_BASE64_PAD = _ord(b'=')
_QUESTION_MARK = _ord(b'?')
_UNICODE_ESCAPE_2 = _ord(b'x')
_UNICODE_ESCAPE_4 = _ord(b'u')
_UNICODE_ESCAPE_8 = _ord(b'U')

_ESCAPED_NEWLINE = u''  # An escaped newline expands to nothing.

# NOTE(review): these look like code point bounds for text, clob and quoted content; their use is outside this
# section, so confirm against the handlers that reference them.
_MAX_TEXT_CHAR = 0x10ffff
_MAX_CLOB_CHAR = 0x7f
_MIN_QUOTED_CHAR = 0x20

# The following suffixes are used for comparison when a token is found that starts with the first letter in
# the keyword. For example, when a new token starts with 't', the next three characters must match those in
# _TRUE_SUFFIX, followed by an acceptable termination character, in order for the token to match the 'true' keyword.
_TRUE_SUFFIX = _seq(b'rue')
_FALSE_SUFFIX = _seq(b'alse')
_NAN_SUFFIX = _seq(b'an')
_INF_SUFFIX = _seq(b'inf')
_IVM_PREFIX = _seq(b'$ion_')

# Maps a recognized Ion version marker to the event that represents it.
_IVM_EVENTS = {
    TEXT_ION_1_0: ION_VERSION_MARKER_EVENT,
}

# Special float values, built once at import time.
_POS_INF = float('+inf')
_NEG_INF = float('-inf')
_NAN = float('nan')


def _ends_value(c):
    """Queries whether ``c`` terminates a value: it is one of ``_VALUE_TERMINATORS`` or marks the end of input."""
    if c in _VALUE_TERMINATORS:
        return True
    return BufferQueue.is_eof(c)


class _NullSequence:
    """Pairs an IonType with the remaining characters of its typed null keyword.

    ``sequence`` holds the characters that must follow once the type has been disambiguated. For example, SYMBOL's
    _NullSequence contains the characters 'mbol' because 'null.s' is ambiguous until 'y' is found, at which point the
    keyword must end in 'mbol'.

    Instances serve as the leaves of the typed null prefix tree defined below and may be indexed like ``sequence``.
    """
    def __init__(self, ion_type, sequence):
        self.ion_type, self.sequence = ion_type, sequence

    def __getitem__(self, item):
        remaining = self.sequence
        return remaining[item]

# Leaves of the typed null prefix tree. Each holds the IonType and the characters that remain after the prefix that
# uniquely identifies that type (e.g. after 'null.sy' only 'mbol' can follow).
_NULL_SUFFIX = _NullSequence(IonType.NULL, _seq(b'ull'))
_NULL_SYMBOL_SUFFIX = _NullSequence(IonType.SYMBOL, _seq(b'mbol'))
_NULL_SEXP_SUFFIX = _NullSequence(IonType.SEXP, _seq(b'xp'))
_NULL_STRING_SUFFIX = _NullSequence(IonType.STRING, _seq(b'ng'))
_NULL_STRUCT_SUFFIX = _NullSequence(IonType.STRUCT, _seq(b'ct'))
_NULL_INT_SUFFIX = _NullSequence(IonType.INT, _seq(b'nt'))
_NULL_FLOAT_SUFFIX = _NullSequence(IonType.FLOAT, _seq(b'loat'))
_NULL_DECIMAL_SUFFIX = _NullSequence(IonType.DECIMAL, _seq(b'ecimal'))
_NULL_CLOB_SUFFIX = _NullSequence(IonType.CLOB, _seq(b'lob'))
_NULL_LIST_SUFFIX = _NullSequence(IonType.LIST, _seq(b'ist'))
_NULL_BLOB_SUFFIX = _NullSequence(IonType.BLOB, _seq(b'ob'))
_NULL_BOOL_SUFFIX = _NullSequence(IonType.BOOL, _seq(b'ol'))
_NULL_TIMESTAMP_SUFFIX = _NullSequence(IonType.TIMESTAMP, _seq(b'imestamp'))


# The following implements a prefix tree used to determine whether a typed null keyword has been found (see
# _typed_null_handler). The leaves of the tree (enumerated above) are the terminal character sequences for the 13
# possible suffixes to 'null.'. Any other suffix to 'null.' is an error. _NULL_STARTS is entered when 'null.' is found.
# Each dict maps the ordinal of the next character either to a deeper dict (still ambiguous) or to a leaf.

# After 'null.str': 'i' -> string, 'u' -> struct.
_NULL_STR_NEXT = {
    _ord(b'i'): _NULL_STRING_SUFFIX,
    _ord(b'u'): _NULL_STRUCT_SUFFIX
}

# After 'null.st': only 'r' may follow.
_NULL_ST_NEXT = {
    _ord(b'r'): _NULL_STR_NEXT
}

# After 'null.s': 'y' -> symbol, 'e' -> sexp, 't' -> string or struct.
_NULL_S_NEXT = {
    _ord(b'y'): _NULL_SYMBOL_SUFFIX,
    _ord(b'e'): _NULL_SEXP_SUFFIX,
    _ord(b't'): _NULL_ST_NEXT
}

# After 'null.b': 'l' -> blob, 'o' -> bool.
_NULL_B_NEXT = {
    _ord(b'l'): _NULL_BLOB_SUFFIX,
    _ord(b'o'): _NULL_BOOL_SUFFIX
}

# Root of the tree: keyed by the first character after 'null.'.
_NULL_STARTS = {
    _ord(b'n'): _NULL_SUFFIX,  # null.null
    _ord(b's'): _NULL_S_NEXT,  # null.string, null.symbol, null.struct, null.sexp
    _ord(b'i'): _NULL_INT_SUFFIX,  # null.int
    _ord(b'f'): _NULL_FLOAT_SUFFIX,  # null.float
    _ord(b'd'): _NULL_DECIMAL_SUFFIX,  # null.decimal
    _ord(b'b'): _NULL_B_NEXT,  # null.bool, null.blob
    _ord(b'c'): _NULL_CLOB_SUFFIX,  # null.clob
    _ord(b'l'): _NULL_LIST_SUFFIX,  # null.list
    _ord(b't'): _NULL_TIMESTAMP_SUFFIX,  # null.timestamp
}


class _ContainerContext(record('end', 'delimiter', 'ion_type', 'is_delimited')):
    """Describes the Ion container that encloses the token currently being read.

    The description travels with the token's context so that lexing decisions which depend on the enclosing
    container can be made. For example, ending a numeric token with ']' is not legal unless that token is contained
    in a list.

    Args:
        end (tuple): The container's end character, if any, wrapped in a tuple.
        delimiter (tuple): The container's delimiter character, if any, wrapped in a tuple.
        ion_type (Optional[IonType]): The IonType of the container, if any.
        is_delimited (bool): True if ``delimiter`` is not empty; otherwise, False.
    """

# The four container descriptions used by the reader. Field order is (end, delimiter, ion_type, is_delimited).
# The top level has no end character, no delimiter, and no IonType; s-expressions have an end character but no
# delimiter.
_C_TOP_LEVEL = _ContainerContext((), (), None, False)
_C_STRUCT = _ContainerContext((_CLOSE_BRACE,), (_COMMA,), IonType.STRUCT, True)
_C_LIST = _ContainerContext((_CLOSE_BRACKET,), (_COMMA,), IonType.LIST, True)
_C_SEXP = _ContainerContext((_CLOSE_PAREN,), (), IonType.SEXP, False)


def _is_escaped(c):
    """Reports the ``is_escaped`` attribute of ``c``, or False when ``c`` has no such attribute.

    Plain integer ordinals carry no escape information, so they are always reported as not escaped.
    """
    return getattr(c, 'is_escaped', False)


def _as_symbol(value, is_symbol_value=True):
    """Produces a :class:`SymbolToken` from ``value`` for use in an :class:`IonEvent`.

    Inputs that provide an ``as_symbol`` method (e.g. :class:`CodePointArray`) are converted by calling it. Any other
    input must already be a `SymbolToken`. Such a token is returned unchanged unless it is being used as something
    other than a symbol value (a field name or annotation) and it provides ``regular_token`` (as :class:`_IVMToken`
    does); in that case the result of ``regular_token()`` is returned instead.

    Args:
        value: The object to convert.
        is_symbol_value (Optional[bool]): True if the result will be emitted as a symbol value; False if it will be
            used as a field name or annotation.
    """
    try:
        converted = value.as_symbol()
    except AttributeError:
        assert isinstance(value, SymbolToken)
    else:
        return converted
    if is_symbol_value:
        return value
    try:
        # An _IVMToken cannot represent an IVM when it is a field name or annotation, so it is replaced by a
        # regular SymbolToken here.
        return value.regular_token()
    except AttributeError:
        return value


class _HandlerContext():
    """Mutable state shared between a handler co-routine and the co-routines it delegates to.

    Most ``set_*`` methods mutate the context in place and return it, so calls can be chained.

    Args:
        container (_ContainerContext): The description of the container in which this context is contained.
        queue (BufferQueue): The data source for the handler.
        field_name (Optional[SymbolToken]): The token representing the field name for the handled
            value.
        annotations (Optional[Sequence[SymbolToken]]): The sequence of annotations tokens
            for the value to be parsed.
        depth (int): the depth of the parser.
        whence (Coroutine): The reference to the co-routine that this handler should delegate
            back to when the handler is logically done.
        value (Optional[bytearray|CodePointArray]): The (in-progress) value of this context's token.
        ion_type (Optional[IonType]): The IonType of the current token.
        pending_symbol (Optional[bytearray|CodePointArray]): A pending symbol, which may end up being an annotation,
            field name, or symbol value.
        quoted_text (Optional[bool]): True if this context represents quoted text; otherwise, False.
        line_comment (Optional[bool]): True if this context represents a line comment; otherwise, False.
        code_point (Optional[int|CodePoint]): The token's current unicode code point, if applicable.
        is_self_delimiting (Optional[bool]): True if this context's token is self-delimiting (a short string, container,
            or comment).
        is_composite (Optional[bool]): True if this context's token is a value immediately followed by another token
            discovered during lookahead.
    """

    def __init__(self, container, queue, field_name, annotations, depth, whence, value, ion_type, pending_symbol,
                 quoted_text=False, line_comment=False, code_point=None, is_self_delimiting=False,
                 is_composite=False):
        # Where this context sits in the stream and who to return to.
        self.container, self.queue = container, queue
        self.depth, self.whence = depth, whence
        # Decorations of the value being read.
        self.field_name, self.annotations = field_name, annotations
        # The token itself.
        self.value, self.ion_type = value, ion_type
        self.pending_symbol = pending_symbol
        self.code_point = code_point
        # Lexing flags.
        self.quoted_text, self.line_comment = quoted_text, line_comment
        self.is_self_delimiting, self.is_composite = is_self_delimiting, is_composite

    def event_transition(self, event_cls, event_type, ion_type, value):
        """Builds a transition that emits an event and then yields to this context's ``whence`` co-routine.

        An un-annotated, top-level symbol that is an ``_IVMToken`` is emitted as its IVM event instead; if the token
        has no IVM event, an error is raised through ``_illegal_character``.
        """
        annotations = self.annotations or ()
        if ion_type is IonType.SYMBOL:
            could_be_ivm = not annotations and self.depth == 0 and isinstance(value, _IVMToken)
            if could_be_ivm:
                ivm_event = value.ivm_event()
                if ivm_event is None:
                    _illegal_character(None, self, 'Illegal IVM: %s.' % (value.text,))
                return Transition(ivm_event, self.whence)
            assert not isinstance(value, _IVMToken)

        event = event_cls(event_type, ion_type, value, self.field_name, annotations, self.depth)
        return Transition(event, self.whence)

    def immediate_transition(self, delegate):
        """Builds an event-less transition that hands control straight to ``delegate``."""
        return Transition(None, delegate)

    def read_data_event(self, whence, complete=False, can_flush=False):
        """Builds a transition to a co-routine that retrieves more data as bytes.

        Args:
            whence (Coroutine): The co-routine to return to after the data is satisfied.
            complete (Optional[bool]): True if STREAM_END should be emitted if no bytes are read or
                available; False if INCOMPLETE should be emitted in that case.
            can_flush (Optional[bool]): True if NEXT may be requested after INCOMPLETE is emitted as a result of this
                data request.
        """
        reader = _read_data_handler(whence, self, complete, can_flush)
        return Transition(None, reader)

    def next_code_point(self, whence):
        """Builds a transition to a co-routine that retrieves data as code points.

        This should be used in quoted string contexts.
        """
        reader = _next_code_point_handler(whence, self)
        return Transition(None, reader)

    def set_unicode(self, quoted_text=False):
        """Ensures the context's ``value`` is a :class:`CodePointArray` for holding text tokens, recording whether
        the text is quoted.

        If the value is already a ``CodePointArray``, nothing changes; the existing ``quoted_text`` flag is asserted
        to match the requested one.
        """
        if not isinstance(self.value, CodePointArray):
            self.value = CodePointArray(self.value)
            self.quoted_text = quoted_text
            self.line_comment = False
        else:
            assert self.quoted_text == quoted_text
        return self

    def set_quoted_text(self, quoted_text):
        """Updates the ``quoted_text`` flag and clears ``line_comment``. Useful when entering and exiting quoted
        text tokens.
        """
        self.line_comment = False
        self.quoted_text = quoted_text
        return self

    def set_self_delimiting(self, is_self_delimiting):
        """Updates the ``is_self_delimiting`` flag. Useful when the end of a self-delimiting token (short string,
        container, or comment) is reached.

        This is distinct from the ``quoted_text`` flag because some quoted text (quoted symbols and long strings) are
        not self-delimiting--they require lookahead to determine if they are complete.
        """
        self.is_self_delimiting = is_self_delimiting
        return self

    def set_code_point(self, code_point):
        """Stores the given ``int`` or :class:`CodePoint` as the context's current ``code_point``."""
        self.code_point = code_point
        return self

    def derive_container_context(self, ion_type, whence):
        """Creates the context for a container nested one level below this one.

        The new context inherits this context's queue, field name, and annotations.

        Raises:
            TypeError: If ``ion_type`` is not STRUCT, LIST, or SEXP.
        """
        if ion_type is IonType.STRUCT:
            description = _C_STRUCT
        elif ion_type is IonType.LIST:
            description = _C_LIST
        elif ion_type is IonType.SEXP:
            description = _C_SEXP
        else:
            raise TypeError('Cannot derive container context for non-container type %s.' % (ion_type.name,))
        nested_depth = self.depth + 1
        return _HandlerContext(
            container=description,
            queue=self.queue,
            field_name=self.field_name,
            annotations=self.annotations,
            depth=nested_depth,
            whence=whence,
            value=None,  # containers don't have a value
            ion_type=ion_type,
            pending_symbol=None
        )

    def set_empty_symbol(self):
        """Clears the field name, annotations, and type, and installs an empty ``pending_symbol``.

        The fields that make this context a child of its container (``container``, ``queue``, ``depth``, ``whence``)
        are retained. This is useful when an empty quoted symbol immediately follows a long string.
        """
        self.field_name = self.annotations = self.ion_type = None
        return self.set_pending_symbol(CodePointArray())

    def derive_child_context(self, whence):
        """Creates a fresh scalar context at the same depth and in the same container as this one."""
        return _HandlerContext(
            container=self.container,
            queue=self.queue,
            field_name=None,
            annotations=None,
            depth=self.depth,
            whence=whence,
            value=bytearray(),  # children start without a value
            ion_type=None,
            pending_symbol=None
        )

    def set_line_comment(self, is_line_comment=True):
        """Updates the ``line_comment`` flag. Useful when entering or exiting a line comment."""
        self.line_comment = is_line_comment
        return self

    def set_ion_type(self, ion_type):
        """Records the given IonType; when the type actually changes, ``line_comment`` is cleared as well."""
        if ion_type is not self.ion_type:
            self.ion_type = ion_type
            self.line_comment = False
        return self

    def set_annotation(self):
        """Moves the ``pending_symbol`` onto the end of the ``annotations`` sequence and resets token state."""
        assert self.pending_symbol is not None
        assert not self.value
        # pending_symbol becomes an annotation
        new_annotation = (_as_symbol(self.pending_symbol, is_symbol_value=False),)
        if self.annotations:
            self.annotations = self.annotations + new_annotation
        else:
            self.annotations = new_annotation
        self.ion_type = None
        self.pending_symbol = None  # reset pending symbol
        self.quoted_text = self.line_comment = self.is_self_delimiting = False
        return self

    def set_field_name(self):
        """Turns the ``pending_symbol`` into the context's ``field_name`` and resets token state."""
        assert self.pending_symbol is not None
        assert not self.value
        # pending_symbol becomes field name
        self.field_name = _as_symbol(self.pending_symbol, is_symbol_value=False)
        self.pending_symbol = None  # reset pending symbol
        self.quoted_text = self.line_comment = self.is_self_delimiting = False
        return self

    def set_pending_symbol(self, pending_symbol=None):
        """Installs the given unicode sequence as ``pending_symbol`` and resets ``value`` to an empty bytearray.

        If the input is None, an empty :class:`CodePointArray` is used.
        """
        if pending_symbol is None:
            pending_symbol = CodePointArray()
        self.value = bytearray()  # reset value
        self.pending_symbol = pending_symbol
        self.line_comment = False
        return self

    def set_composite(self, is_composite):
        """Updates the ``is_composite`` flag."""
        self.is_composite = is_composite
        return self


class _CompositeTransition(Transition):
    """An event transition bundled with an immediate transition to the handler for the token that follows.

    This is useful when some lookahead is required to determine if a token has ended, e.g. in the case of long strings.
    Constructing an instance also marks ``current_context`` as composite.

    Args:
        event_transition (Transition): A transition with a non-None IonEvent.
        current_context (_HandlerContext): The context for the value contained in ``event_transition``.
        next_handler (Coroutine): The handler that will lex the next token. Only None if ``next_context`` contains a
            complete token (as is the case with an empty quoted symbol following a long string).
        next_context (Optional[_HandlerContext]): The context for the next token. If None, a new child context
            will be derived from ``current_context``.
        initialize_handler (Optional[bool]): True if the ``next_handler`` coroutine needs to be initialized;
            otherwise, False.
    """
    def __new__(cls, event_transition, *args, **kwargs):
        # The composite itself carries the event and delegate of the wrapped transition.
        event, delegate = event_transition.event, event_transition.delegate
        return Transition.__new__(cls, event, delegate)

    def __init__(self, event_transition, current_context, next_handler, next_context=None, initialize_handler=True):
        assert event_transition.event is not None
        if next_context is None:
            next_context = current_context.derive_child_context(current_context.whence)
        if next_handler is None:
            following = None
        else:
            handler = next_handler(next_context) if initialize_handler else next_handler
            following = next_context.immediate_transition(handler)
        current_context.set_composite(True)
        self.next_transition = following
        self.next_context = next_context


def _decode(value):
    """Decodes the given bytes-like token into text using the module's ``_ENCODING``."""
    text = value.decode(_ENCODING)
    return text


def _parse_number(parse_func, value, base=10):
    """Returns a zero-argument thunk that calls ``parse_func(value, base)`` when invoked.

    Nothing is parsed until the thunk is called, and each call parses again.
    """
    return lambda: parse_func(value, base)


def _base_10(parse_func, value, base, decode=False):
    """Applies a single-argument parser to a base-10 token.

    Args:
        parse_func (callable): Parser that accepts only the token (no base argument).
        value: The token to parse.
        base (int): Must be 10; accepted only so this function has the same call shape as ``_base_n``.
        decode (Optional[bool]): True if the token must be passed through ``_decode`` before parsing.
    """
    assert base == 10
    token = _decode(value) if decode else value
    return parse_func(token)


def _base_n(parse_func, value, base):
    """Decodes the token with ``_decode`` and parses the resulting text with ``parse_func`` in the given base."""
    text = _decode(value)
    return parse_func(text, base)


# Thunk factories for each numeric form. Calling one with a token returns a zero-argument callable (see
# _parse_number) that parses the token only when invoked.
# The binary and hex int parsers decode the token to text first (via _base_n); of the base-10 parsers, only the
# decimal parser decodes.
# In Python 2, int() returns a long if the input overflows an int.
_parse_decimal_int = partial(_parse_number, partial(_base_10, int))
_parse_binary_int = partial(_parse_number, partial(_base_n, int), base=2)
_parse_hex_int = partial(_parse_number, partial(_base_n, int), base=16)
_parse_float = partial(_parse_number, partial(_base_10, float))
_parse_decimal = partial(_parse_number, partial(_base_10, Decimal, decode=True))


@coroutine
def _number_negative_start_handler(c, ctx):
    """Handles numeric values that begin with a negative sign.

    The sign is appended to the token, the type is provisionally set to INT, and the next character selects the
    delegate co-routine from _NEGATIVE_TABLE.
    """
    assert c == _MINUS
    assert len(ctx.value) == 0
    ctx.set_ion_type(IonType.INT)
    ctx.value.append(c)
    following, _ = yield
    delegate_factory = _NEGATIVE_TABLE[following]
    yield ctx.immediate_transition(delegate_factory(following, ctx))


@coroutine
def _number_zero_start_handler(c, ctx):
    """Handles numeric values that begin with zero or negative zero.

    If the character after the zero ends the value, an INT event is emitted (after confirming that a trailing slash
    starts a comment). Otherwise the character selects the delegate co-routine from _ZERO_START_TABLE.
    """
    assert c == _ZERO
    token = ctx.value
    assert len(token) == 0 or (len(token) == 1 and token[0] == _MINUS)
    ctx.set_ion_type(IonType.INT)
    token.append(c)
    c, _ = yield
    if _ends_value(c):
        int_event = ctx.event_transition(
            IonThunkEvent, IonEventType.SCALAR, ctx.ion_type, _parse_decimal_int(ctx.value))
        if c == _SLASH:
            yield ctx.immediate_transition(_number_slash_end_handler(c, ctx, int_event))
        else:
            yield int_event
    delegate_factory = _ZERO_START_TABLE[c]
    yield ctx.immediate_transition(delegate_factory(c, ctx))


@coroutine
def _number_or_timestamp_handler(c, ctx):
    """Handles numeric values that begin with a digit (the non-zero digits are routed here).

    Digits are accumulated into the token. A value terminator ends the token as an INT; any other character selects
    a delegate co-routine from _NUMBER_OR_TIMESTAMP_TABLE.
    """
    assert c in _DIGITS
    ctx.set_ion_type(IonType.INT)  # If this is the last digit read, this value is an Int.
    digits = ctx.value
    digits.append(c)
    c, self = yield
    # Until something else is decided, keep returning control to this co-routine.
    trans = ctx.immediate_transition(self)
    while True:
        if _ends_value(c):
            trans = ctx.event_transition(IonThunkEvent, IonEventType.SCALAR,
                                         ctx.ion_type, _parse_decimal_int(ctx.value))
            if c == _SLASH:
                trans = ctx.immediate_transition(_number_slash_end_handler(c, ctx, trans))
        elif c in _DIGITS:
            digits.append(c)
        else:
            trans = ctx.immediate_transition(_NUMBER_OR_TIMESTAMP_TABLE[c](c, ctx))
        c, _ = yield trans


@coroutine
def _number_slash_end_handler(c, ctx, event):
    """Handles numeric values that are immediately followed by a forward slash.

    This is only legal if the slash begins a comment. The character after the slash is fed to a comment handler;
    if that handler accepts it, the number's event is emitted as part of a composite transition that continues with
    the comment. Otherwise the comment handler's error propagates.
    """
    assert c == _SLASH
    after_slash, _ = yield
    comment_ctx = ctx.derive_child_context(ctx.whence)
    comment_handler = _comment_handler(_SLASH, comment_ctx, comment_ctx.whence)
    comment_handler.send((after_slash, comment_handler))
    # If the previous line returns without error, it's a valid comment and the number may be emitted.
    yield _CompositeTransition(event, ctx, comment_handler, comment_ctx, initialize_handler=False)


def _numeric_handler_factory(charset, transition, assertion, illegal_before_underscore, parse_func,
                             illegal_at_end=(None,), ion_type=None, append_first_if_not=None, first_char=None):
    """Builds a handler co-routine that tokenizes one numeric component (a token or sub-token).

    The generated handler appends characters from ``charset`` to the token, skips legal underscores, emits the value
    when a terminator is found, and defers to ``transition`` for any other character.

    Args:
        charset (sequence): Set of ordinals of legal characters for this numeric component.
        transition (callable): Called upon termination of this component (i.e. when a character not in ``charset`` is
            found). Accepts the previous character ordinal, the current character ordinal, the current context, and the
            previous transition. Returns a Transition if the component ends legally; otherwise, raises an error.
        assertion (callable): Accepts the first character's ordinal and the current context. Returns True if this is
            a legal start to the component.
        illegal_before_underscore (sequence): Set of ordinals of illegal characters to precede an underscore for this
            component.
        parse_func (callable): Called upon ending the numeric value. Accepts the current token value and returns a
            thunk that lazily parses the token.
        illegal_at_end (Optional[sequence]): Set of ordinals of characters that may not legally end the value.
        ion_type (Optional[IonType]): The type of the value if it were to end on this component.
        append_first_if_not (Optional[int]): The ordinal of a character that should not be appended to the token if
            it occurs first in this component (e.g. an underscore in many cases).
        first_char (Optional[int]): The ordinal of the character that should be appended instead of the character that
            occurs first in this component. This is useful for preparing the token for parsing in the case where a
            particular character is peculiar to the Ion format (e.g. 'd' to denote the exponent of a decimal value
            should be replaced with 'e' for compatibility with python's Decimal type).
    """
    @coroutine
    def numeric_handler(c, ctx):
        assert assertion(c, ctx)
        if ion_type is not None:
            ctx.set_ion_type(ion_type)
        token = ctx.value
        if c != append_first_if_not:
            token.append(c if first_char is None else first_char)
        prev = c
        c, self = yield
        # Until something else is decided, keep returning control to this co-routine.
        trans = ctx.immediate_transition(self)
        while True:
            if _ends_value(c):
                if prev == _UNDERSCORE or prev in illegal_at_end:
                    _illegal_character(c, ctx, '%s at end of number.' % (_chr(prev),))
                trans = ctx.event_transition(IonThunkEvent, IonEventType.SCALAR, ctx.ion_type, parse_func(ctx.value))
                if c == _SLASH:
                    trans = ctx.immediate_transition(_number_slash_end_handler(c, ctx, trans))
            elif c == _UNDERSCORE:
                # Underscores are separators only: they are validated but never appended to the token.
                if prev == _UNDERSCORE or prev in illegal_before_underscore:
                    _illegal_character(c, ctx, 'Underscore after %s.' % (_chr(prev),))
            elif c in charset:
                token.append(c)
            else:
                trans = transition(prev, c, ctx, trans)
            prev = c
            c, _ = yield trans
    return numeric_handler


def _exponent_handler_factory(ion_type, exp_chars, parse_func, first_char=None):
    """Builds a handler co-routine that tokenizes a numeric exponent.

    Within the exponent, the only non-digit character accepted is a sign directly after the exponent character.
    Neither an exponent character nor a sign may end the value or precede an underscore.

    Args:
        ion_type (IonType): The type of the value with this exponent.
        exp_chars (sequence): The set of ordinals of the legal exponent characters for this component.
        parse_func (callable): Called upon ending the numeric value. Accepts the current token value and returns a
            thunk that lazily parses the token.
        first_char (Optional[int]): The ordinal of the character that should be appended instead of the character that
            occurs first in this component. This is useful for preparing the token for parsing in the case where a
            particular character is peculiar to the Ion format (e.g. 'd' to denote the exponent of a decimal value
            should be replaced with 'e' for compatibility with python's Decimal type).
    """
    def starts_with_exponent(c, ctx):
        return c in exp_chars

    def transition(prev, c, ctx, trans):
        if c not in _SIGN or prev not in exp_chars:
            _illegal_character(c, ctx)
        else:
            ctx.value.append(c)
        return trans

    not_allowed = exp_chars + _SIGN
    return _numeric_handler_factory(_DIGITS, transition, starts_with_exponent, not_allowed, parse_func,
                                    illegal_at_end=not_allowed, ion_type=ion_type, first_char=first_char)


# Exponent handlers. The decimal handler stores 'e' in place of the 'd'/'D' exponent character (first_char) so that
# the token can be parsed by python's Decimal type; the float handler keeps the 'e'/'E' as written.
_decimal_handler = _exponent_handler_factory(IonType.DECIMAL, _DECIMAL_EXPS, _parse_decimal, first_char=_ord(b'e'))
_float_handler = _exponent_handler_factory(IonType.FLOAT, _FLOAT_EXPS, _parse_float)


def _coefficient_handler_factory(trans_table, parse_func, assertion=lambda c, ctx: True,
                                 ion_type=None, append_first_if_not=None):
    """Builds a handler co-routine that tokenizes the coefficient of a numeric value.

    Args:
        trans_table (dict): Maps the ordinal of the first character of the *next* component of the numeric token to
            the handler for that component.
        parse_func (callable): Invoked when the numeric value ends. Receives the current token value and returns a
            thunk that lazily parses the token.
        assertion (callable): Receives the ordinal of the first character and the current context. Returns True if
            that character is a legal start to this component.
        ion_type (Optional[IonType]): The type of the value, should it end on this coefficient.
        append_first_if_not (Optional[int]): Ordinal of a character that must not be appended to the token when it
            is the first character of this component (e.g. an underscore in many cases).
    """
    def to_next_component(prev, c, ctx, trans):
        # A digit separator may never be the last character of a component.
        if prev == _UNDERSCORE:
            _illegal_character(c, ctx, 'Underscore before %s.' % (_chr(c),))
        next_handler = trans_table[c]
        return ctx.immediate_transition(next_handler(c, ctx))

    illegal_before_underscore = (_DOT,)
    return _numeric_handler_factory(
        _DIGITS, to_next_component, assertion, illegal_before_underscore, parse_func,
        ion_type=ion_type, append_first_if_not=append_first_if_not
    )


# Lookup table for the component that may follow a fractional coefficient: an exponent character selects the decimal
# or float exponent handler.
# NOTE(review): the behavior for ordinals not listed depends on `_defaultdict`, which is defined elsewhere.
_FRACTIONAL_NUMBER_TABLE = _defaultdict(
    _merge_mappings(
        (_DECIMAL_EXPS, _decimal_handler),
        (_FLOAT_EXPS, _float_handler)
    )
)

# Tokenizes the fractional part of a number. The component must begin with '.', and the value is a decimal if it ends
# on this coefficient.
fractional_number_handler = _coefficient_handler_factory(
    _FRACTIONAL_NUMBER_TABLE, _parse_decimal, assertion=lambda c, ctx: c == _DOT, ion_type=IonType.DECIMAL)

# Lookup table for the component that may follow a whole-number coefficient: everything that may follow a fractional
# coefficient, plus '.' to begin the fractional part.
_WHOLE_NUMBER_TABLE = _defaultdict(
    _merge_mappings(
        {
            _DOT: fractional_number_handler,
        },
        _FRACTIONAL_NUMBER_TABLE
    )
)

# Tokenizes the whole-number part of a number. An underscore occurring first in the component is not appended to the
# token (see ``append_first_if_not``).
_whole_number_handler = _coefficient_handler_factory(_WHOLE_NUMBER_TABLE, _parse_decimal_int,
                                                     append_first_if_not=_UNDERSCORE)


def _radix_int_handler_factory(radix_indicators, charset, parse_func):
    """Builds a handler co-routine that tokenizes an integer written in a particular radix.

    Args:
        radix_indicators (sequence): Ordinals of the characters that announce the radix of this int.
        charset (sequence): Ordinals of the digits that are legal for this radix.
        parse_func (callable): Invoked when the numeric value ends. Receives the current token value and returns a
            thunk that lazily parses the token.
    """
    def assertion(c, ctx):
        # The radix indicator is only legal directly after '0' or '-0' while the token is still an int.
        if c not in radix_indicators:
            return False
        token = ctx.value
        is_zero = len(token) == 1 and token[0] == _ZERO
        is_negative_zero = len(token) == 2 and token[0] == _MINUS and token[1] == _ZERO
        if not (is_zero or is_negative_zero):
            return False
        return ctx.ion_type == IonType.INT

    def reject(prev, c, ctx, trans):
        # No other component may follow a radix int, so any character outside the charset is illegal.
        return _illegal_character(c, ctx)

    return _numeric_handler_factory(
        charset, reject, assertion, radix_indicators, parse_func, illegal_at_end=radix_indicators
    )


# Handlers for ints written with a radix indicator; each pairs the indicator characters with the digits legal for
# that radix and the matching parse function.
_binary_int_handler = _radix_int_handler_factory(_BINARY_RADIX, _BINARY_DIGITS, _parse_binary_int)
_hex_int_handler = _radix_int_handler_factory(_HEX_RADIX, _HEX_DIGITS, _parse_hex_int)


@coroutine
def _timestamp_zero_start_handler(c, ctx):
    """Handles a numeric token that begins with a zero followed by another digit.

    Such a token can only be a timestamp; anything else is an error. Digits are accumulated until a year delimiter
    is found, at which point control passes to `_timestamp_handler`.
    """
    year_digits = ctx.value
    ctx.set_ion_type(IonType.TIMESTAMP)
    if year_digits[0] == _MINUS:
        _illegal_character(c, ctx, 'Negative year not allowed.')
    year_digits.append(c)
    c, self = yield
    trans = ctx.immediate_transition(self)
    while True:
        if c in _TIMESTAMP_YEAR_DELIMITERS:
            # The year is complete; the remaining components are tokenized by the timestamp handler.
            trans = ctx.immediate_transition(_timestamp_handler(c, ctx))
        elif c not in _DIGITS:
            _illegal_character(c, ctx)
        else:
            year_digits.append(c)
        c, _ = yield trans


class _TimestampState(Enum):
    # The components of a timestamp. The numeric order matters: `_timestamp_handler` moves to the next date/time
    # component by adding one to the current state and compares states with `>`, and `_TimestampTokens` uses the
    # states to index its field list.
    YEAR = 0
    MONTH = 1
    DAY = 2
    HOUR = 3
    MINUTE = 4
    SECOND = 5
    FRACTIONAL = 6
    # Local offset components.
    OFF_HOUR = 7
    OFF_MINUTE = 8


class _TimestampTokens:
    """Container for the raw numeric tokens that make up a `Timestamp`, one slot per `_TimestampState`."""
    def __init__(self, year=None):
        # Every component starts out absent (None); only the year may be supplied up front.
        slots = [None for _ in iter(_TimestampState)]
        if year is not None:
            slots[_TimestampState.YEAR] = year
        self._fields = slots

    def transition(self, state):
        """Starts a new, empty token for the given state and returns it so that the caller can append to it."""
        token = bytearray()
        self._fields[state] = token
        return token

    def __getitem__(self, item):
        """Returns the token stored for the given state, or None if that component was never started."""
        return self._fields[item]


# Byte strings of zero through five '0' characters; the entry at index n holds n zeros.
# NOTE(review): not referenced within this section of the file -- confirm it is still used before removing.
_ZEROS = [
    b'',
    b'0',
    b'00',
    b'000',
    b'0000',
    b'00000'
]


def _parse_timestamp(tokens):
    """Returns a thunk that converts the given `_TimestampTokens` into a `Timestamp`.

    Components that are absent from the tokens take their default values (month and day 1; hour, minute and second
    0), and the timestamp's precision is that of the finest component that is present.
    """
    def parse():
        # Local offset.
        offset_hour = tokens[_TimestampState.OFF_HOUR]
        offset_minute = tokens[_TimestampState.OFF_MINUTE]
        if offset_hour is None:
            assert offset_minute is None
        else:
            assert offset_minute is not None
            # The sign is only present on the hour token, so it is applied to the minutes explicitly.
            sign = -1 if _MINUS in offset_hour else 1
            offset_hour = int(offset_hour)
            offset_minute = sign * int(offset_minute)
            if sign == -1 and offset_hour == 0 and offset_minute == 0:
                # -00:00 (unknown UTC offset) is a naive datetime.
                offset_hour = None
                offset_minute = None

        # Date components.
        year_token = tokens[_TimestampState.YEAR]
        assert year_token is not None
        year = int(year_token)
        precision = TimestampPrecision.YEAR

        month_token = tokens[_TimestampState.MONTH]
        if month_token is not None:
            month = int(month_token)
            precision = TimestampPrecision.MONTH
        else:
            month = 1

        day_token = tokens[_TimestampState.DAY]
        if day_token is not None:
            day = int(day_token)
            precision = TimestampPrecision.DAY
        else:
            day = 1

        # Time components. Hour and minute always occur together.
        hour_token = tokens[_TimestampState.HOUR]
        minute_token = tokens[_TimestampState.MINUTE]
        if hour_token is not None:
            assert minute_token is not None
            hour = int(hour_token)
            minute = int(minute_token)
            precision = TimestampPrecision.MINUTE
        else:
            assert minute_token is None
            hour = 0
            minute = 0

        fraction = None
        second_token = tokens[_TimestampState.SECOND]
        if second_token is not None:
            second = int(second_token)
            precision = TimestampPrecision.SECOND
            fraction_token = tokens[_TimestampState.FRACTIONAL]
            if fraction_token is not None:
                # Shift the digits right by their count, e.g. b'250' becomes Decimal('0.250').
                fraction = Decimal(int(fraction_token)).scaleb(-len(fraction_token))
        else:
            second = 0

        return timestamp(
            year, month, day,
            hour, minute, second, None,
            offset_hour, offset_minute,
            precision=precision, fractional_precision=None, fractional_seconds=fraction
        )
    return parse


@coroutine
def _timestamp_handler(c, ctx):
    """Handles timestamp values. Entered after the year component has been completed; tokenizes the remaining
    components.

    Args:
        c (int): Ordinal of the delimiter that ended the year; must be one of `_TIMESTAMP_YEAR_DELIMITERS`.
        ctx (_HandlerContext): The current context, whose value holds the year digits.

    The handler is a small state machine: ``state`` is the component currently being tokenized, ``nxt`` is the set of
    ordinals that may legally come next, and ``can_terminate`` records whether the timestamp may end at this point.
    Upon termination, a thunk event that lazily parses the collected tokens is emitted.
    """
    assert c in _TIMESTAMP_YEAR_DELIMITERS
    ctx.set_ion_type(IonType.TIMESTAMP)
    if len(ctx.value) != 4:
        _illegal_character(c, ctx, 'Timestamp year is %d digits; expected 4.' % (len(ctx.value),))
    prev = c
    c, self = yield
    trans = ctx.immediate_transition(self)
    state = _TimestampState.YEAR
    nxt = _DIGITS
    tokens = _TimestampTokens(ctx.value)
    val = None
    can_terminate = False
    if prev == _T:
        # A 'T' directly after the year ends the timestamp at year precision. Digits must not be accepted here:
        # they would otherwise be promoted to a month component (e.g. `2007T12T` would be read as 2007-12).
        nxt = _VALUE_TERMINATORS
        can_terminate = True
    while True:
        is_eof = can_terminate and BufferQueue.is_eof(c)
        if c not in nxt and not is_eof:
            _illegal_character(c, ctx, 'Expected %r in state %r.' % ([_chr(x) for x in nxt], state))
        if c in _VALUE_TERMINATORS or is_eof:
            if not can_terminate:
                _illegal_character(c, ctx, 'Unexpected termination of timestamp.')
            trans = ctx.event_transition(IonThunkEvent, IonEventType.SCALAR, ctx.ion_type, _parse_timestamp(tokens))
            if c == _SLASH:
                # A slash may begin a comment (or, in an s-expression, an operator); delegate the decision.
                trans = ctx.immediate_transition(_number_slash_end_handler(c, ctx, trans))
        else:
            can_terminate = False
            if c == _Z:
                # Z implies UTC, i.e. +00:00 local offset.
                tokens.transition(_TimestampState.OFF_HOUR).append(_ZERO)
                tokens.transition(_TimestampState.OFF_MINUTE).append(_ZERO)
                nxt = _VALUE_TERMINATORS
                can_terminate = True
            elif c == _T:
                # The timestamp may always end after 'T', but a time component may only follow a complete date.
                # Allowing digits after a month-precision 'T' would promote them to a day component (e.g.
                # `2007-01T05` would be read as 2007-01-05).
                if state == _TimestampState.DAY:
                    nxt = _VALUE_TERMINATORS + _DIGITS
                else:
                    nxt = _VALUE_TERMINATORS
                can_terminate = True
            elif c in _TIMESTAMP_DELIMITERS:
                nxt = _DIGITS
            elif c in _DIGITS:
                if prev == _PLUS or (state > _TimestampState.MONTH and prev == _HYPHEN):
                    # First digit of the local offset. A hyphen only denotes an offset once the date is complete;
                    # before that it separates the date components.
                    state = _TimestampState.OFF_HOUR
                    val = tokens.transition(state)
                    if prev == _HYPHEN:
                        # Keep the sign with the token so that the parser can recover a negative offset.
                        val.append(prev)
                elif prev in (_TIMESTAMP_DELIMITERS + (_T,)):
                    # First digit of the next date/time component.
                    state = _TimestampState[state + 1]
                    val = tokens.transition(state)
                    if state == _TimestampState.FRACTIONAL:
                        nxt = _DIGITS + _TIMESTAMP_OFFSET_INDICATORS
                elif prev in _DIGITS:
                    # Second (or, for fractional seconds, subsequent) digit of the current component; what may
                    # follow depends on which component this is.
                    if state == _TimestampState.MONTH:
                        nxt = _TIMESTAMP_YEAR_DELIMITERS
                    elif state == _TimestampState.DAY:
                        nxt = (_T,) + _VALUE_TERMINATORS
                        can_terminate = True
                    elif state == _TimestampState.HOUR:
                        nxt = (_COLON,)
                    elif state == _TimestampState.MINUTE:
                        nxt = _TIMESTAMP_OFFSET_INDICATORS + (_COLON,)
                    elif state == _TimestampState.SECOND:
                        nxt = _TIMESTAMP_OFFSET_INDICATORS + (_DOT,)
                    elif state == _TimestampState.FRACTIONAL:
                        nxt = _DIGITS + _TIMESTAMP_OFFSET_INDICATORS
                    elif state == _TimestampState.OFF_HOUR:
                        nxt = (_COLON,)
                    elif state == _TimestampState.OFF_MINUTE:
                        nxt = _VALUE_TERMINATORS
                        can_terminate = True
                    else:
                        raise ValueError('Unknown timestamp state %r.' % (state,))
                else:
                    # Reaching this branch would be indicative of a programming error within this state machine.
                    raise ValueError('Digit following %s in timestamp state %r.' % (_chr(prev), state))
                val.append(c)
        prev = c
        c, _ = yield trans


@coroutine
def _comment_handler(c, ctx, whence):
    """Consumes a line or block comment, then immediately transitions back to `whence`.

    Entered on the first slash; the character that follows decides which kind of comment this is.
    """
    assert c == _SLASH
    c, self = yield
    is_line = c == _SLASH
    is_block = (not is_line) and c == _ASTERISK
    if is_line:
        ctx.set_line_comment()
    elif is_block:
        if ctx.line_comment:
            # This happens when a block comment immediately follows a line comment.
            ctx.set_line_comment(False)
    else:
        _illegal_character(c, ctx, 'Illegal character sequence "/%s".' % (_chr(c),))
    last = None
    keep_reading = ctx.immediate_transition(self)
    while True:
        c, _ = yield keep_reading
        if is_block:
            # A block comment ends on the sequence '*/'.
            if last == _ASTERISK and c == _SLASH:
                break
            last = c
        elif c in _NEWLINES or BufferQueue.is_eof(c):
            # A line comment ends at the end of the line or of the stream.
            break
    yield ctx.set_self_delimiting(True).immediate_transition(whence)


@coroutine
def _sexp_slash_handler(c, ctx, whence=None, pending_event=None):
    """Resolves a forward-slash found within an s-expression, which begins either a comment or an operator.

    The character after the slash is peeked at (and pushed back onto the queue) to decide. If it turns out to be an
    operator and a ``pending_event`` was supplied, that event is emitted before the operator is tokenized.
    """
    assert c == _SLASH
    return_to = ctx.whence if whence is None else whence
    c, self = yield
    ctx.queue.unread(c)
    starts_comment = c == _ASTERISK or c == _SLASH
    if not starts_comment:
        if pending_event is not None:
            # Since this is the start of a new value and not a comment, the pending event must be emitted.
            assert pending_event.event is not None
            yield _CompositeTransition(pending_event, ctx, partial(_operator_symbol_handler, _SLASH))
        yield ctx.immediate_transition(_operator_symbol_handler(_SLASH, ctx))
    else:
        yield ctx.immediate_transition(_comment_handler(_SLASH, ctx, return_to))


# Byte strings of zero, one and two single quotes, indexed by count. `_long_string_handler` uses this to restore
# quotes that were tentatively counted as part of a delimiter but turned out to be data.
_SINGLE_QUOTES = [
    b"",
    b"'",
    b"''"
]


def _validate_quoted_text(allowed_whitespace, c, ctx, max_char):
    """Raises (via `_illegal_character`) if the given character may not appear unescaped within quoted text.

    A character is acceptable if it is one of ``allowed_whitespace``, if it was produced by an escape sequence, or if
    its ordinal lies within [`_MIN_QUOTED_CHAR`, ``max_char``].
    """
    if c in allowed_whitespace:
        return
    if _is_escaped(c):
        return
    if c < _MIN_QUOTED_CHAR or c > max_char:
        message = 'Character out of range [%d, %d] for this type.' % (_MIN_QUOTED_CHAR, max_char,)
        _illegal_character(c, ctx, message)

# Validator for long (triple-quoted) text: every character in `_WHITESPACE` is exempt from the range check.
_validate_long_string_text = partial(_validate_quoted_text, _WHITESPACE)


def _is_escaped_newline(c):
    """Returns True if the given character is a newline that was escaped in the source text.

    The character must be a newline flagged as escaped whose ``char`` attribute equals `_ESCAPED_NEWLINE`; a
    character without a ``char`` attribute is never an escaped newline.
    """
    if c in _NEWLINES and _is_escaped(c):
        try:
            return c.char == _ESCAPED_NEWLINE
        except AttributeError:
            pass
    return False


@coroutine
def _long_string_handler(c, ctx, is_field_name=False):
    """Handles triple-quoted strings. Remains active until a value other than a long string is encountered.

    Args:
        c (int): Ordinal of the first character; must be a single quote.
        ctx (_HandlerContext): The current context. If its type is already CLOB, the text is tokenized as clob data.
        is_field_name (Optional[bool]): True if the text is a field name, in which case it is accumulated as the
            context's pending symbol rather than as its value.

    Adjacent long string segments are accumulated into the same token; whitespace and comments may separate them.
    """
    assert c == _SINGLE_QUOTE
    is_clob = ctx.ion_type is IonType.CLOB
    max_char = _MAX_CLOB_CHAR if is_clob else _MAX_TEXT_CHAR
    assert not (is_clob and is_field_name)
    if not is_clob and not is_field_name:
        ctx.set_ion_type(IonType.STRING)
    assert not ctx.value
    ctx.set_unicode(quoted_text=True)
    val = ctx.value
    if is_field_name:
        assert not val
        ctx.set_pending_symbol()
        val = ctx.pending_symbol
    # Number of consecutive unescaped single quotes seen so far; three of them toggle `in_data`.
    quotes = 0
    # True while inside a triple-quoted segment; False while between segments.
    in_data = True
    c, self = yield
    here = ctx.immediate_transition(self)
    trans = here
    while True:
        if c == _SINGLE_QUOTE and not _is_escaped(c):
            quotes += 1
            if quotes == 3:
                # A complete delimiter either closes the current segment or opens the next one.
                in_data = not in_data
                ctx.set_quoted_text(in_data)
                quotes = 0
        else:
            if in_data:
                _validate_long_string_text(c, ctx, max_char)
                # Any quotes found in the meantime are part of the data
                val.extend(_SINGLE_QUOTES[quotes])
                if not _is_escaped_newline(c):
                    val.append(c)
                quotes = 0
            else:
                if quotes > 0:
                    # One or two quotes followed by something else: this is not another long string segment.
                    assert quotes < 3
                    if is_field_name or is_clob:
                        # There are at least two values here, which is illegal for field names or within clobs.
                        _illegal_character(c, ctx, 'Malformed triple-quoted text: %s' % (val,))
                    else:
                        # This string value is followed by a quoted symbol.
                        if ctx.container.is_delimited:
                            _illegal_character(c, ctx, 'Delimiter %s not found after value.'
                                               % (_chr(ctx.container.delimiter[0]),))
                        trans = ctx.event_transition(IonEvent, IonEventType.SCALAR, ctx.ion_type, ctx.value.as_text())
                        if quotes == 1:
                            if BufferQueue.is_eof(c):
                                _illegal_character(c, ctx, "Unexpected EOF.")
                            # c was read as a single byte. Re-read it as a code point.
                            ctx.queue.unread(c)
                            ctx.set_quoted_text(True)
                            c, _ = yield ctx.immediate_transition(self)
                            # Emit the string, then tokenize the quoted symbol beginning with the re-read character.
                            trans = _CompositeTransition(
                                trans,
                                ctx,
                                partial(_quoted_symbol_handler, c, is_field_name=False),
                            )
                        else:  # quotes == 2
                            # Two quotes followed by a non-quote form the empty symbol.
                            trans = _CompositeTransition(trans, ctx, None, ctx.set_empty_symbol())
                elif c not in _WHITESPACE:
                    # The first non-whitespace character after the closing delimiter decides how the token ends.
                    if is_clob:
                        trans = ctx.immediate_transition(_clob_end_handler(c, ctx))
                    elif c == _SLASH:
                        if ctx.container.ion_type is IonType.SEXP:
                            # In an s-expression the slash may be an operator, in which case the string must be
                            # emitted first; hand the pending event to the slash handler.
                            pending = ctx.event_transition(IonEvent, IonEventType.SCALAR,
                                                           ctx.ion_type, ctx.value.as_text())
                            trans = ctx.immediate_transition(_sexp_slash_handler(c, ctx, self, pending))
                        else:
                            # Comments return control here, since another segment may follow them.
                            trans = ctx.immediate_transition(_comment_handler(c, ctx, self))
                    elif is_field_name:
                        if c != _COLON:
                            _illegal_character(c, ctx, 'Illegal character after field name %s.' % (val,))
                        trans = ctx.immediate_transition(ctx.whence)
                    else:
                        trans = ctx.event_transition(IonEvent, IonEventType.SCALAR, ctx.ion_type, ctx.value.as_text())
        c, _ = yield trans
        ctx.set_self_delimiting(False)  # If comments separated long string components, this would have been set.
        trans = here


@coroutine
def _typed_null_handler(c, ctx):
    """Tokenizes the type name of a typed null. Entered once `null.` has been found.

    The type name is matched in two phases: characters are first looked up in `_NULL_STARTS` until a `_NullSequence`
    is reached, and the remaining characters must then match that sequence exactly.
    """
    assert c == _DOT
    c, self = yield
    candidates = _NULL_STARTS
    matched = 0
    suffix_length = None
    complete = False
    trans = ctx.immediate_transition(self)
    while True:
        if complete:
            # The whole type name has been matched; it must now be properly terminated.
            terminated = _ends_value(c) or (ctx.container.ion_type is IonType.SEXP and c in _OPERATORS)
            if not terminated:
                _illegal_character(c, ctx, 'Illegal null type.')
            else:
                trans = ctx.event_transition(IonEvent, IonEventType.SCALAR, candidates.ion_type, None)
        elif suffix_length is not None:
            # Matching the remainder of the type name character by character.
            if c != candidates[matched]:
                _illegal_character(c, ctx, 'Illegal null type.')
            matched += 1
            complete = matched == suffix_length
        else:
            # Still narrowing down which type name this is.
            if c not in candidates:
                _illegal_character(c, ctx, 'Illegal null type.')
            candidates = candidates[c]
            if isinstance(candidates, _NullSequence):
                suffix_length = len(candidates.sequence)
        c, _ = yield trans


@coroutine
def _symbol_or_keyword_handler(c, ctx, is_field_name=False):
    """Handles the start of an unquoted text token.

    This may be an operator (if in an s-expression), an identifier symbol, or a keyword.

    Args:
        c (int): Ordinal of the first character of the token.
        ctx (_HandlerContext): The current context.
        is_field_name (Optional[bool]): True if the token occurs in field name position, where keywords are illegal.

    The keywords `null`, `nan`, `true` and `false` are matched character by character; as soon as the token can no
    longer be any of them, tokenization is handed to `_unquoted_symbol_handler`.
    """
    in_sexp = ctx.container.ion_type is IonType.SEXP
    if c not in _IDENTIFIER_STARTS:
        if in_sexp and c in _OPERATORS:
            # Peek at the next character, push it back, and let the operator handler take over.
            c_next, _ = yield
            ctx.queue.unread(c_next)
            yield ctx.immediate_transition(_operator_symbol_handler(c, ctx))
        _illegal_character(c, ctx)
    assert not ctx.value
    ctx.set_unicode().set_ion_type(IonType.SYMBOL)
    val = ctx.value
    val.append(c)
    # Which keywords the token may still turn out to be, judging by its first character.
    maybe_null = c == _N_LOWER
    maybe_nan = maybe_null
    maybe_true = c == _T_LOWER
    maybe_false = c == _F_LOWER
    c, self = yield
    trans = ctx.immediate_transition(self)
    keyword_trans = None
    # Index into the keyword suffixes (the keyword minus its first character) of the character expected next.
    match_index = 0
    while True:
        # NOTE: the helpers below read `c` and `match_index` from the enclosing scope at call time.
        def check_keyword(name, keyword_sequence, ion_type, value, match_transition=lambda: None):
            # Returns a pair: whether the token may still be this keyword, and the transition to take if the keyword
            # has been completed (None otherwise).
            maybe_keyword = True
            transition = None
            if match_index < len(keyword_sequence):
                maybe_keyword = c == keyword_sequence[match_index]
            else:
                # The whole keyword has been matched; the current character decides whether it really is one.
                transition = match_transition()
                if transition is not None:
                    pass
                elif _ends_value(c):
                    if is_field_name:
                        _illegal_character(c, ctx, '%s keyword as field name not allowed.' % (name,))
                    transition = ctx.event_transition(IonEvent, IonEventType.SCALAR, ion_type, value)
                elif c == _COLON:
                    message = ''
                    if is_field_name:
                        message = '%s keyword as field name not allowed.' % (name,)
                    _illegal_character(c, ctx, message)
                elif in_sexp and c in _OPERATORS:
                    transition = ctx.event_transition(IonEvent, IonEventType.SCALAR, ion_type, value)
                else:
                    # The token continues past the keyword, so it is an ordinary symbol.
                    maybe_keyword = False
            return maybe_keyword, transition
        if maybe_null:
            def check_null_dot():
                # `null` followed by a dot begins a typed null.
                transition = None
                found = c == _DOT
                if found:
                    if is_field_name:
                        _illegal_character(c, ctx, "Illegal character in field name.")
                    transition = ctx.immediate_transition(_typed_null_handler(c, ctx))
                return transition
            maybe_null, keyword_trans = check_keyword('null', _NULL_SUFFIX.sequence,
                                                      IonType.NULL, None, check_null_dot)
        # `null` and `nan` share their first character, so `nan` is checked independently of the `null` result.
        if maybe_nan:
            maybe_nan, keyword_trans = check_keyword('nan', _NAN_SUFFIX, IonType.FLOAT, _NAN)
        elif maybe_true:
            maybe_true, keyword_trans = check_keyword('true', _TRUE_SUFFIX, IonType.BOOL, True)
        elif maybe_false:
            maybe_false, keyword_trans = check_keyword('false', _FALSE_SUFFIX, IonType.BOOL, False)
        if maybe_null or maybe_nan or maybe_true or maybe_false:
            if keyword_trans is not None:
                trans = keyword_trans
            else:
                val.append(c)
                match_index += 1
        else:
            # Not a keyword: the token is a symbol, which either ends here or continues as an unquoted symbol.
            if c in _SYMBOL_TOKEN_TERMINATORS:
                # This might be an annotation or a field name
                ctx.set_pending_symbol(val)
                trans = ctx.immediate_transition(ctx.whence)
            elif _ends_value(c) or (in_sexp and c in _OPERATORS):
                trans = ctx.event_transition(IonEvent, IonEventType.SCALAR, IonType.SYMBOL, val.as_symbol())
            else:
                trans = ctx.immediate_transition(_unquoted_symbol_handler(c, ctx, is_field_name=is_field_name))
        c, _ = yield trans


def _inf_or_operator_handler_factory(c_start, is_delegate=True):
    """Generates handler co-routines for values that may be `+inf` or `-inf`.

    Args:
        c_start (int): The ordinal of the character that starts this token (either `+` or `-`).
        is_delegate (bool): True if a different handler began processing this token; otherwise, False. This will only
            be true for `-inf`, because it is not the only value that can start with `-`; `+inf` is the only value
            (outside of a s-expression) that can start with `+`.

    Returns:
        callable: The handler co-routine. If the token turns out not to be an infinity, the handler treats it as an
        operator symbol, which is only legal within an s-expression.
    """
    @coroutine
    def inf_or_operator_handler(c, ctx):
        next_ctx = None
        if not is_delegate:
            ctx.value.append(c_start)
            c, self = yield
        else:
            # The delegating handler has already consumed the sign and read the character after it; push that
            # character back so that it is received again below.
            assert ctx.value[0] == c_start
            assert c not in _DIGITS
            ctx.queue.unread(c)
            next_ctx = ctx
            _, self = yield
            assert c == _
        maybe_inf = True
        ctx.set_ion_type(IonType.FLOAT)
        # Index into `_INF_SUFFIX` of the character expected next.
        match_index = 0
        trans = ctx.immediate_transition(self)
        while True:
            if maybe_inf:
                if match_index < len(_INF_SUFFIX):
                    maybe_inf = c == _INF_SUFFIX[match_index]
                else:
                    # The whole suffix matched; this is an infinity only if the value ends here.
                    if _ends_value(c) or (ctx.container.ion_type is IonType.SEXP and c in _OPERATORS):
                        yield ctx.event_transition(
                            IonEvent, IonEventType.SCALAR, IonType.FLOAT,
                            _NEG_INF if c_start == _MINUS else _POS_INF
                        )
                    else:
                        maybe_inf = False
            if maybe_inf:
                match_index += 1
            else:
                ctx.set_unicode()
                if match_index > 0:
                    # Part of the suffix matched before the mismatch. Those characters begin a separate symbol that
                    # follows the sign, so they are moved into a child context.
                    next_ctx = ctx.derive_child_context(ctx.whence)
                    for ch in _INF_SUFFIX[0:match_index]:
                        next_ctx.value.append(ch)
                break
            c, self = yield trans
        # From here on the sign can only be an operator, which is legal solely within an s-expression.
        if ctx.container is not _C_SEXP:
            _illegal_character(c, ctx if next_ctx is None else next_ctx,
                               'Illegal character following %s.' % (_chr(c_start),))
        if match_index == 0:
            if c in _OPERATORS:
                yield ctx.immediate_transition(_operator_symbol_handler(c, ctx))
            yield ctx.event_transition(IonEvent, IonEventType.SCALAR, IonType.SYMBOL, ctx.value.as_symbol())
        # Emit the sign as an operator symbol, then continue tokenizing the symbol that follows it.
        yield _CompositeTransition(
            ctx.event_transition(IonEvent, IonEventType.SCALAR, IonType.SYMBOL, ctx.value.as_symbol()),
            ctx,
            partial(_unquoted_symbol_handler, c),
            next_ctx
        )
    return inf_or_operator_handler


# The '-' variant is a delegate (the default): another handler begins tokens that start with '-'. The '+' variant
# handles its token from the first character.
_negative_inf_or_sexp_hyphen_handler = _inf_or_operator_handler_factory(_MINUS)
_positive_inf_or_sexp_plus_handler = _inf_or_operator_handler_factory(_PLUS, is_delegate=False)


@coroutine
def _operator_symbol_handler(c, ctx):
    """Tokenizes an operator symbol within an s-expression.

    Consecutive operator characters are accumulated into a single symbol, which is emitted as soon as a character
    that is not an operator is received.
    """
    assert c in _OPERATORS
    ctx.set_unicode()
    operator_text = ctx.value
    operator_text.append(c)
    c, self = yield
    keep_reading = ctx.immediate_transition(self)
    while True:
        if c not in _OPERATORS:
            break
        operator_text.append(c)
        c, _ = yield keep_reading
    yield ctx.event_transition(IonEvent, IonEventType.SCALAR, IonType.SYMBOL, operator_text.as_symbol())


def _symbol_token_end(c, ctx, is_field_name, value=None):
    """Returns the transition that ends the current symbol token.

    If ``value`` is not given, the context's current value is used as the token.
    """
    token = ctx.value if value is None else value
    if not (is_field_name or c in _SYMBOL_TOKEN_TERMINATORS or ctx.quoted_text):
        # The token can only be a symbol value, so it is emitted right away.
        return ctx.event_transition(IonEvent, IonEventType.SCALAR, IonType.SYMBOL, _as_symbol(token))
    # This might be an annotation or a field name. Mark it as self-delimiting because a symbol token termination
    # character has been found.
    ctx.set_self_delimiting(ctx.quoted_text).set_pending_symbol(token).set_quoted_text(False)
    return ctx.immediate_transition(ctx.whence)


@coroutine
def _unquoted_symbol_handler(c, ctx, is_field_name=False):
    """Tokenizes identifier symbols.

    Within an s-expression, an identifier may be followed by an operator without any intervening whitespace.
    """
    in_sexp = ctx.container.ion_type is IonType.SEXP
    ctx.set_unicode()
    if c not in _IDENTIFIER_CHARACTERS:
        if in_sexp and c in _OPERATORS:
            # Peek at the next character and push it back; then emit the symbol collected so far and start
            # tokenizing the operator.
            lookahead, _ = yield
            ctx.queue.unread(lookahead)
            assert ctx.value
            yield _CompositeTransition(
                ctx.event_transition(IonEvent, IonEventType.SCALAR, IonType.SYMBOL, ctx.value.as_symbol()),
                ctx,
                partial(_operator_symbol_handler, c)
            )
        _illegal_character(c, ctx.set_ion_type(IonType.SYMBOL))
    text = ctx.value
    text.append(c)
    last = c
    c, self = yield
    keep_reading = ctx.immediate_transition(self)
    while True:
        if c not in _WHITESPACE:
            # Trailing whitespace is skipped, so the token ends on the first non-whitespace character after it, or
            # on any character that terminates a symbol.
            token_ended = last in _WHITESPACE or _ends_value(c) or c == _COLON or (in_sexp and c in _OPERATORS)
            if token_ended:
                break
            if c not in _IDENTIFIER_CHARACTERS:
                _illegal_character(c, ctx.set_ion_type(IonType.SYMBOL))
            text.append(c)
        last = c
        c, _ = yield keep_reading
    yield _symbol_token_end(c, ctx, is_field_name)


class _IVMToken(SymbolToken):
    """A :class:`SymbolToken` whose text matches the Ion version marker (IVM) pattern."""
    def ivm_event(self):
        """Looks up the :class:`IonEvent` for this token's text in `_IVM_EVENTS`.

        Returns `None` if the text is not a supported IVM.
        """
        try:
            event = _IVM_EVENTS[self.text]
        except KeyError:
            event = None
        return event

    def regular_token(self):
        """Returns a plain :class:`SymbolToken` with the same text, sid and location as this token.

        This will be used in _as_symbol when this token is used as an annotation or field name, in which cases it
        can no longer be an IVM.
        """
        text, sid, location = self.text, self.sid, self.location
        return SymbolToken(text, sid, location)


@coroutine
def _symbol_identifier_or_unquoted_symbol_handler(c, ctx, is_field_name=False):
    """Handles symbol tokens that begin with a dollar sign. These may end up being system symbols ($ion_*), symbol
    identifiers ('$' DIGITS+), or regular unquoted symbols.

    Args:
        c (int): Ordinal of the first character; must be a dollar sign.
        ctx (_HandlerContext): The current context.
        is_field_name (Optional[bool]): True if the token occurs in field name position.
    """
    assert c == _DOLLAR_SIGN
    in_sexp = ctx.container.ion_type is IonType.SEXP
    ctx.set_unicode().set_ion_type(IonType.SYMBOL)
    val = ctx.value
    val.append(c)
    prev = c
    c, self = yield
    trans = ctx.immediate_transition(self)
    # An IVM is only considered at depth zero, outside of field names, and when no annotations are present.
    maybe_ivm = ctx.depth == 0 and not is_field_name and not ctx.annotations
    complete_ivm = False
    maybe_symbol_identifier = True
    # Index into `_IVM_PREFIX` of the character expected next; the dollar sign has already been consumed.
    match_index = 1
    ivm_post_underscore = False
    while True:
        if c not in _WHITESPACE:
            if prev in _WHITESPACE or _ends_value(c) or c == _COLON or (in_sexp and c in _OPERATORS):
                break
            # A symbol identifier consists solely of digits after the dollar sign.
            maybe_symbol_identifier = maybe_symbol_identifier and c in _DIGITS
            if maybe_ivm:
                if match_index == len(_IVM_PREFIX):
                    # The prefix has matched. What follows must be digits, a single underscore, and more digits;
                    # the IVM is complete once a digit has been seen after that underscore.
                    if c in _DIGITS:
                        if ivm_post_underscore:
                            complete_ivm = True
                    elif c == _UNDERSCORE and not ivm_post_underscore:
                        ivm_post_underscore = True
                    else:
                        maybe_ivm = False
                        complete_ivm = False
                else:
                    maybe_ivm = c == _IVM_PREFIX[match_index]
            if maybe_ivm:
                if match_index < len(_IVM_PREFIX):
                    match_index += 1
            elif not maybe_symbol_identifier:
                # Neither an IVM nor a symbol identifier: continue as a regular unquoted symbol.
                yield ctx.immediate_transition(_unquoted_symbol_handler(c, ctx, is_field_name))
            val.append(c)
        elif match_index < len(_IVM_PREFIX):
            # Whitespace before the prefix has fully matched rules out an IVM.
            maybe_ivm = False
        prev = c
        c, _ = yield trans
    if len(val) == 1:
        # The token is just the dollar sign, which is a regular symbol.
        assert val[0] == _chr(_DOLLAR_SIGN)
    elif maybe_symbol_identifier:
        # '$' followed only by digits: a symbol identifier, represented by a token with a sid and no text.
        assert not maybe_ivm
        sid = int(val[1:])
        val = SymbolToken(None, sid)
    elif complete_ivm:
        val = _IVMToken(*val.as_symbol())
    yield _symbol_token_end(c, ctx, is_field_name, value=val)


# Validator for short quoted text: of the whitespace characters, only those in `_WHITESPACE_NOT_NL` are exempt from
# the range check.
_validate_short_quoted_text = partial(_validate_quoted_text, _WHITESPACE_NOT_NL)


def _quoted_text_handler_factory(delimiter, assertion, before, after, append_first=True,
                                 on_close=lambda ctx: None):
    """Builds a coroutine handler for a delimited text token (either a short string or a quoted symbol).

    Args:
        delimiter (int): Ordinal of the character that terminates the token.
        assertion (callable): Given the ordinal of the first character, returns True if that character may legally
            begin the token.
        before (callable): Invoked once during initialization with the first character's ordinal, the current
            context, True if the token is a field name, and True if the token is a clob. Returns a pair: the buffer
            that accumulates the token's text, and True if ``on_close`` should be invoked when the token terminates.
        after (callable): Invoked once the token has terminated, with the final character's ordinal, the current
            context, and True if the token is a field name. Returns a Transition.
        append_first (Optional[bool]): True if the first character given to the coroutine is part of the text and
            should be appended to the buffer; False if it should not be appended (in which case it must be the
            delimiter).
        on_close (Optional[callable]): Invoked with the current context when the closing delimiter is found, but only
            if ``before`` asked for it. Returns a Transition, which is yielded before ``after`` is invoked. This
            allows a different kind of Transition to be yielded depending on how the token was initialized (e.g.
            string vs. clob).
    """
    @coroutine
    def quoted_text_handler(c, ctx, is_field_name=False):
        assert assertion(c)
        is_clob = ctx.ion_type is IonType.CLOB
        if is_clob:
            max_char = _MAX_CLOB_CHAR
        else:
            max_char = _MAX_TEXT_CHAR
        ctx.set_unicode(quoted_text=True)
        text, emit_on_close = before(c, ctx, is_field_name, is_clob)
        # Escaped newlines are never part of the token's text.
        if append_first and not _is_escaped_newline(c):
            text.append(c)
        c, this = yield
        keep_reading = ctx.immediate_transition(this)
        while True:
            if c == delimiter and not _is_escaped(c):
                # Closing delimiter. Optionally emit the on_close transition, which also supplies the character
                # that follows the token.
                if emit_on_close:
                    c, _ = yield on_close(ctx)
                break
            _validate_short_quoted_text(c, ctx, max_char)
            if not _is_escaped_newline(c):
                text.append(c)
            c, _ = yield keep_reading
        yield after(c, ctx, is_field_name)
    return quoted_text_handler


def _short_string_handler_factory():
    """Builds the handler for short (double-quoted) text, which may be a string, a field name, or clob data."""
    def starts_with_double_quote(c):
        # The first character given to the handler must be the opening double quote.
        return c == _DOUBLE_QUOTE

    def before(c, ctx, is_field_name, is_clob):
        assert not (is_clob and is_field_name)
        is_string = not is_clob and not is_field_name
        if is_string:
            ctx.set_ion_type(IonType.STRING)
        if not is_field_name:
            return ctx.value, is_string
        # Field names accumulate into the pending symbol rather than the value buffer, which must still be empty.
        assert not ctx.value
        ctx.set_pending_symbol()
        return ctx.pending_symbol, is_string

    def on_close(ctx):
        # Only requested for plain strings (see ``before``): emit the completed scalar.
        ctx.set_self_delimiting(True)
        return ctx.event_transition(
            IonEvent,
            IonEventType.SCALAR,
            ctx.ion_type,
            ctx.value.as_text()
        )

    def after(c, ctx, is_field_name):
        ctx.set_quoted_text(False).set_self_delimiting(True)
        if is_field_name:
            target = ctx.whence
        else:
            target = _clob_end_handler(c, ctx)
        return ctx.immediate_transition(target)

    return _quoted_text_handler_factory(
        _DOUBLE_QUOTE,
        starts_with_double_quote,
        before,
        after,
        append_first=False,
        on_close=on_close
    )


# Handler for tokens that begin with a double quote. Built once at import time and shared by the value, field name,
# and clob dispatch paths in this module.
_short_string_handler = _short_string_handler_factory()


def _quoted_symbol_handler_factory():
    """Builds the handler for quoted (single-quoted) symbols."""
    def is_legal_first_character(c):
        # The first character is symbol content, so it may only be a single quote if that quote is escaped.
        return c != _SINGLE_QUOTE or _is_escaped(c)

    def before(c, ctx, is_field_name, is_clob):
        assert not is_clob
        # The first character is appended by the generic handler (append_first defaults to True); validate it here.
        _validate_short_quoted_text(c, ctx, _MAX_TEXT_CHAR)
        return ctx.value, False

    return _quoted_text_handler_factory(
        delimiter=_SINGLE_QUOTE,
        assertion=is_legal_first_character,
        before=before,
        after=_symbol_token_end,
    )

# Handler for the content of a single-quoted symbol; entered from _long_string_or_symbol_handler once the character
# after the opening quote is known not to be another unescaped single quote.
_quoted_symbol_handler = _quoted_symbol_handler_factory()


def _single_quote_handler_factory(on_single_quote, on_other):
    """Builds a handler that classifies a token beginning with one or more single quotes by its next character.

    Args:
        on_single_quote (callable): Invoked when the next character is another (unescaped) single quote. Accepts the
            current character's ordinal, the current context, and True if the token is a field name; returns a
            Transition.
        on_other (callable): Invoked when the next character is anything else. Accepts the current character's
            ordinal, the current context, and True if the token is a field name; returns a Transition.
    """
    @coroutine
    def single_quote_handler(c, ctx, is_field_name=False):
        assert c == _SINGLE_QUOTE
        c, _ = yield
        if c == _SINGLE_QUOTE and not _is_escaped(c):
            outcome = on_single_quote(c, ctx, is_field_name)
        else:
            # Anything other than a bare single quote is quoted text content.
            ctx.set_unicode(quoted_text=True)
            outcome = on_other(c, ctx, is_field_name)
        yield outcome
    return single_quote_handler


# Entered after two consecutive single quotes have been seen.
#   * A third single quote: switch to quoted text mode and hand off to the long string handler.
#   * Anything else: the two quotes were an empty quoted symbol; record it as the pending symbol and return to the
#     enclosing handler (ctx.whence).
_two_single_quotes_handler = _single_quote_handler_factory(
    lambda c, ctx, is_field_name: ctx.set_unicode(quoted_text=True).immediate_transition(
        _long_string_handler(c, ctx, is_field_name)
    ),
    lambda c, ctx, is_field_name:
        ctx.set_ion_type(IonType.SYMBOL).set_pending_symbol().immediate_transition(ctx.whence)  # Empty symbol.
)
# Entered after a single opening single quote has been seen.
#   * A second single quote: mark the token as a symbol for now and let _two_single_quotes_handler decide between an
#     empty symbol and a long string.
#   * Anything else: this is the first character of a quoted symbol's content.
_long_string_or_symbol_handler = _single_quote_handler_factory(
    lambda c, ctx, is_field_name:
        ctx.set_ion_type(IonType.SYMBOL).immediate_transition(_two_single_quotes_handler(c, ctx, is_field_name)),
    lambda c, ctx, is_field_name: ctx.immediate_transition(_quoted_symbol_handler(c, ctx, is_field_name))
)


@coroutine
def _struct_or_lob_handler(c, ctx):
    """Handles tokens that begin with an open brace by dispatching on the character that follows it.

    A second open brace selects the lob start handler; any other character selects the struct handler (see
    ``_STRUCT_OR_LOB_TABLE``).
    """
    assert c == _OPEN_BRACE
    c, _ = yield
    next_handler = _STRUCT_OR_LOB_TABLE[c]
    yield ctx.immediate_transition(next_handler(c, ctx))


def _b64decode_py2(value):
    """Decodes base-64 data after first converting ``value`` to ``six.binary_type``."""
    # Some versions of python 2 don't support bytearray as input to base64.b64decode, so convert up front.
    as_binary = six.binary_type(value)
    return base64.b64decode(as_binary)

# Base-64 decoder used for blob values: the converting wrapper on Python 2, the standard library function otherwise.
_b64decode = _b64decode_py2 if six.PY2 else base64.b64decode


def _parse_lob(ion_type, value):
    """Returns a zero-argument callable that converts the accumulated lob text into binary data when invoked.

    Args:
        ion_type (IonType): The lob's type. For ``IonType.CLOB`` each character of ``value.as_text()`` becomes one
            byte; for any other type ``value`` is base-64 decoded.
        value: The buffer holding the lob's raw text.
    """
    def parse():
        if ion_type is not IonType.CLOB:
            return _b64decode(value)
        # Every clob character must map to exactly one byte.
        return six.binary_type(bytearray(ord(ch) for ch in value.as_text()))
    return parse


@coroutine
def _lob_start_handler(c, ctx):
    """Handles tokens that begin with two open braces.

    Skips leading whitespace and then classifies the lob by its first significant character:

    * a double quote starts a short-string clob;
    * three consecutive single quotes start a long-string clob;
    * anything else is treated as the first character of a blob.

    Raises:
        IonException: (via ``_illegal_character``) if one or two single quotes are followed by whitespace, a double
            quote, or any other character that is not a single quote. A clob's data must be delimited by ``"`` or
            ``'''``, so a partial run of single quotes can never be valid.
    """
    assert c == _OPEN_BRACE
    c, self = yield
    trans = ctx.immediate_transition(self)
    quotes = 0  # Number of consecutive single quotes seen so far.
    while True:
        if c in _WHITESPACE:
            if quotes > 0:
                _illegal_character(c, ctx)
        elif c == _DOUBLE_QUOTE:
            if quotes > 0:
                _illegal_character(c, ctx)
            ctx.set_ion_type(IonType.CLOB).set_unicode(quoted_text=True)
            yield ctx.immediate_transition(_short_string_handler(c, ctx))
        elif c == _SINGLE_QUOTE:
            if not quotes:
                ctx.set_ion_type(IonType.CLOB).set_unicode(quoted_text=True)
            quotes += 1
            if quotes == 3:
                yield ctx.immediate_transition(_long_string_handler(c, ctx))
        else:
            if quotes > 0:
                # One or two single quotes followed by data (e.g. {{'abcd}}) is neither a clob nor a blob. Without
                # this check the quotes would be silently dropped and the rest tokenized as base-64.
                _illegal_character(c, ctx)
            yield ctx.immediate_transition(_blob_end_handler(c, ctx))
        c, _ = yield trans


def _lob_end_handler_factory(ion_type, action, validate=lambda c, ctx, action_res: None):
    """Builds the coroutine handler that consumes the remainder of a blob or clob through its closing ``}}``.

    Args:
        ion_type (IonType): The type of this lob (either blob or clob).
        action (callable): Invoked for every character before the end of the lob that is neither whitespace nor a
            closing brace. Accepts the current character's ordinal, the current context, the previous character's
            ordinal, the result of the previous call to ``action`` (None if there was none), and True if this is the
            first call to ``action``. Returns whatever state subsequent calls to ``action`` will need. For blobs,
            this should verify that the character is valid base64; for clobs, this should ensure that nothing
            illegal (e.g. a comment) appears between the end of the data and the end of the clob.
        validate (Optional[callable]): Invoked once the second closing brace has been found. Accepts the current
            character's ordinal, the current context, and the result of the last call to ``action``; raises an error
            if this is not a valid lob value.
    """
    assert ion_type is IonType.BLOB or ion_type is IonType.CLOB

    @coroutine
    def lob_end_handler(c, ctx):
        payload = ctx.value
        state = None
        if c != _CLOSE_BRACE and c not in _WHITESPACE:
            # On the first call the character doubles as its own "previous" character.
            state = action(c, ctx, c, None, True)
        last = c
        c, this = yield
        keep_reading = ctx.immediate_transition(this)
        while True:
            if c in _WHITESPACE:
                # Nothing may separate the two closing braces.
                if last == _CLOSE_BRACE:
                    _illegal_character(c, ctx.set_ion_type(ion_type), 'Expected }.')
            elif c == _CLOSE_BRACE:
                if last == _CLOSE_BRACE:
                    validate(c, ctx, state)
                    break
            else:
                state = action(c, ctx, last, state, False)
            last = c
            c, _ = yield keep_reading
        # Lob values are self-delimiting (they are terminated by '}}').
        ctx.set_self_delimiting(True)
        lazy_value = _parse_lob(ion_type, payload)
        yield ctx.event_transition(IonThunkEvent, IonEventType.SCALAR, ion_type, lazy_value)
    return lob_end_handler


def _blob_end_handler_factory():
    """Generates the handler for the end of a blob value. This includes the base-64 data and the two closing braces.

    The generated handler tracks the number of base-64 digits and pad characters seen so far and rejects, via
    ``_illegal_character``:

    * any character that is neither a base-64 digit nor the pad character;
    * a digit or pad character that directly follows a closing brace;
    * a digit that appears after any pad character, even when whitespace separates them;
    * at the closing braces, more than two pad characters or a total length that is not a multiple of four.
    """
    def expand_res(res):
        # ``res`` is None until ``action`` has been called at least once.
        return (0, 0) if res is None else res

    def action(c, ctx, prev, res, is_first):
        num_digits, num_pads = expand_res(res)
        if c in _BASE64_DIGITS:
            # Padding is only legal at the very end of the data. Checking the running pad count (rather than just
            # the previous character) also rejects digits separated from the padding by whitespace, e.g. "AA= A".
            if prev == _CLOSE_BRACE or num_pads > 0:
                _illegal_character(c, ctx.set_ion_type(IonType.BLOB))
            num_digits += 1
        elif c == _BASE64_PAD:
            if prev == _CLOSE_BRACE:
                _illegal_character(c, ctx.set_ion_type(IonType.BLOB))
            num_pads += 1
        else:
            _illegal_character(c, ctx.set_ion_type(IonType.BLOB))
        ctx.value.append(c)
        return num_digits, num_pads

    def validate(c, ctx, res):
        num_digits, num_pads = expand_res(res)
        # Base-64 data is padded to a multiple of four characters using at most two pad characters.
        if num_pads > 2 or (num_digits + num_pads) % 4 != 0:
            _illegal_character(c, ctx, 'Incorrect number of pad characters (%d) for a blob of %d base-64 digits.'
                               % (num_pads, num_digits))

    return _lob_end_handler_factory(IonType.BLOB, action, validate)

# Handler for blob data and its closing braces; entered from _lob_start_handler with the blob's first character.
_blob_end_handler = _blob_end_handler_factory()


def _clob_end_handler_factory():
    """Builds the handler for the end of a clob value, covering everything from the closing quote of the data
    through the second closing brace.
    """
    def action(c, ctx, prev, res, is_first):
        # The only character tolerated here is the closing double quote of short-string clob data, and only as the
        # very first character handed to the handler.
        is_closing_quote = is_first and ctx.is_self_delimiting and c == _DOUBLE_QUOTE
        if not is_closing_quote:
            _illegal_character(c, ctx)
            return None
        assert c is prev
        return res

    return _lob_end_handler_factory(IonType.CLOB, action)

# Handler for the portion of a clob that follows its quoted data; used by the short string handler's ``after`` hook.
_clob_end_handler = _clob_end_handler_factory()


# Field name variants of the symbol/string handlers: the same coroutines with ``is_field_name=True`` pre-bound.
# These are the entries of _FIELD_NAME_START_TABLE.
_single_quoted_field_name_handler = partial(_long_string_or_symbol_handler, is_field_name=True)
_double_quoted_field_name_handler = partial(_short_string_handler, is_field_name=True)
_unquoted_field_name_handler = partial(_symbol_or_keyword_handler, is_field_name=True)
_symbol_identifier_or_unquoted_field_name_handler = partial(_symbol_identifier_or_unquoted_symbol_handler,
                                                            is_field_name=True)


def _container_start_handler_factory(ion_type, before_yield=lambda c, ctx: None):
    """Builds a handler for a token that opens a container.

    Args:
        ion_type (IonType): The type of the container being opened.
        before_yield (Optional[callable]): Invoked once at initialization with the first character's ordinal and the
            current context, to perform any set-up the container type requires.
    """
    assert ion_type.is_container

    @coroutine
    def container_start_handler(c, ctx):
        before_yield(c, ctx)
        # Wait to be resumed; whatever is sent in is irrelevant to a container start.
        _ = yield
        start_transition = ctx.event_transition(IonEvent, IonEventType.CONTAINER_START, ion_type, value=None)
        yield start_transition
    return container_start_handler


# The struct handler pushes its first character back onto the queue (``ctx.queue.unread``) because one character
# past the { had to be read to make sure it wasn't a lob; that character belongs to the struct's content.
_struct_handler = _container_start_handler_factory(IonType.STRUCT, lambda c, ctx: ctx.queue.unread(c))
# Lists and s-expressions need no set-up before emitting their CONTAINER_START event.
_list_handler = _container_start_handler_factory(IonType.LIST)
_sexp_handler = _container_start_handler_factory(IonType.SEXP)


@coroutine
def _read_data_handler(whence, ctx, complete, can_flush):
    """Creates a co-routine for retrieving data up to a requested size.

    Each time it is resumed with a data event, the co-routine does one of the following:

    * if the event carries non-empty data, appends it to the context's queue and transitions back to ``whence``;
    * if the event carries no data and is a NEXT request, marks the queue as EOF and transitions back to ``whence``
      (or raises if flushing is not permitted);
    * otherwise, emits STREAM_END or INCOMPLETE (depending on ``complete``) and waits for another data event.

    Args:
        whence (Coroutine): The co-routine to return to after the data is satisfied.
        ctx (_HandlerContext): The context for the read.
        complete (True|False): True if STREAM_END should be emitted if no bytes are read or
            available; False if INCOMPLETE should be emitted in that case.
        can_flush (True|False): True if NEXT may be requested after INCOMPLETE is emitted as a result of this data
            request.

    Raises:
        IonException: (via ``_illegal_character``) if NEXT is requested while ``can_flush`` is False.
    """
    trans = None
    queue = ctx.queue
    # Event emitted whenever the request cannot be satisfied. A real conditional expression is used here rather
    # than ``complete and X or Y``, which would silently select Y if X ever evaluated as falsy.
    unsatisfied_event = ION_STREAM_END_EVENT if complete else ION_STREAM_INCOMPLETE_EVENT

    while True:
        data_event, self = (yield trans)
        if data_event is not None:
            data = data_event.data
            if data is not None:
                if len(data) > 0:
                    queue.extend(data)
                    yield Transition(None, whence)
            elif data_event.type is ReadEventType.NEXT:
                # The caller is asking for the next value without supplying more data: treat this as end of input.
                queue.mark_eof()
                if not can_flush:
                    _illegal_character(queue.read_byte(), ctx, "Unexpected EOF.")
                yield Transition(None, whence)
        trans = Transition(unsatisfied_event, self)


# Dispatch tables. Each maps the ordinal of a character to the handler for the token it introduces or continues.

# Handlers for the character that follows a leading zero.
_ZERO_START_TABLE = _defaultdict(
    _merge_mappings(
        _WHOLE_NUMBER_TABLE,
        (_DIGITS, _timestamp_zero_start_handler),
        (_BINARY_RADIX, _binary_int_handler),
        (_HEX_RADIX, _hex_int_handler)
    )
)

# Handlers for the characters of a token that began with a non-zero digit; it may turn out to be a number or a
# timestamp.
_NUMBER_OR_TIMESTAMP_TABLE = _defaultdict(
    _merge_mappings(
        {
            _UNDERSCORE: _whole_number_handler,
        },
        _WHOLE_NUMBER_TABLE,
        (_TIMESTAMP_YEAR_DELIMITERS, _timestamp_handler)
    )
)

# Handlers for the character that follows a leading minus sign. Anything that is not a digit falls back to the
# negative infinity / s-expression hyphen handler.
_NEGATIVE_TABLE = _defaultdict(
    _merge_mappings(
        {
            _ZERO: _number_zero_start_handler,
        },
        (_DIGITS[1:], _whole_number_handler)
    ),
    fallback=_negative_inf_or_sexp_hyphen_handler
)

# Handlers for the character that follows an open brace: a second open brace starts a lob; anything else is a struct.
_STRUCT_OR_LOB_TABLE = _defaultdict({
    _OPEN_BRACE: _lob_start_handler
}, _struct_handler)


# Handlers for the first character of a struct field name. Unlisted characters are illegal.
_FIELD_NAME_START_TABLE = _defaultdict(
    _merge_mappings(
        {
            _SINGLE_QUOTE: _single_quoted_field_name_handler,
            _DOUBLE_QUOTE: _double_quoted_field_name_handler,
            _DOLLAR_SIGN: _symbol_identifier_or_unquoted_field_name_handler,
        },
        (_IDENTIFIER_STARTS, _unquoted_field_name_handler)
    ),
    fallback=partial(_illegal_character, message='Illegal character in field name.')
)

# Handlers for the first character of a value. Unlisted characters are treated as the start of a symbol or keyword.
_VALUE_START_TABLE = _defaultdict(
    _merge_mappings(
        {
            _MINUS: _number_negative_start_handler,
            _PLUS: _positive_inf_or_sexp_plus_handler,
            _ZERO: _number_zero_start_handler,
            _OPEN_BRACE: _struct_or_lob_handler,
            _OPEN_PAREN: _sexp_handler,
            _OPEN_BRACKET: _list_handler,
            _SINGLE_QUOTE: _long_string_or_symbol_handler,
            _DOUBLE_QUOTE: _short_string_handler,
            _DOLLAR_SIGN: _symbol_identifier_or_unquoted_symbol_handler,
        },
        (_DIGITS[1:], _number_or_timestamp_handler)
    ),
    fallback=_symbol_or_keyword_handler
)

# First characters of values for which _container_handler sets ``can_flush`` as soon as the token starts.
# NOTE(review): _container_handler uses the lookup result directly as a boolean. If _defaultdict returns the
# fallback object itself on a miss (rather than calling it), a miss yields the lambda below, which is truthy --
# confirm against _defaultdict's implementation.
_IMMEDIATE_FLUSH_TABLE = _defaultdict(
    _merge_mappings(
        (_DIGITS, True),
        (_LETTERS, True),
        {_DOLLAR_SIGN: True},
    ),
    fallback=lambda: False
)


@coroutine
def _container_handler(c, ctx):
    """Coroutine for container values. Delegates to other coroutines to tokenize all child values.

    The coroutine loops over the characters of the container described by ``ctx``. For each child token it selects a
    handler from ``_FIELD_NAME_START_TABLE`` (when a struct field name is expected) or ``_VALUE_START_TABLE``, feeds
    that handler one character (or code point, inside quoted text) at a time, and yields the event Transitions the
    handler produces. Symbol tokens are held as a "pending symbol" on the child context until the following
    characters reveal whether they are a field name, an annotation, or a symbol value.

    When the container's end character is found, a CONTAINER_END event is yielded with a transition back to
    ``ctx.whence``; the coroutine must not be resumed after that.

    Args:
        c (int|None): The first character to process.
        ctx (_HandlerContext): The context of this container.
    """
    _, self = (yield None)
    queue = ctx.queue
    child_context = None
    # In a struct, the next token is expected to be a field name until a colon has been consumed.
    is_field_name = ctx.ion_type is IonType.STRUCT
    # True once a child value has been emitted in a delimited container and its delimiter has not yet been seen.
    delimiter_required = False
    # Passed to read_data_event: whether running out of data here may be reported as the end of the stream.
    complete = ctx.depth == 0
    # Passed to read_data_event: whether the current token may be flushed by a NEXT request.
    can_flush = False

    def has_pending_symbol():
        return child_context and child_context.pending_symbol is not None

    def symbol_value_event():
        # Converts the pending symbol into a SYMBOL scalar event.
        return child_context.event_transition(
            IonEvent, IonEventType.SCALAR, IonType.SYMBOL, _as_symbol(child_context.pending_symbol))

    def pending_symbol_value():
        # Returns the event for the pending symbol (if any) treated as a value; None if nothing is pending.
        if has_pending_symbol():
            assert not child_context.value
            if ctx.ion_type is IonType.STRUCT and child_context.field_name is None:
                _illegal_character(c, ctx,
                                   'Encountered STRUCT value %s without field name.' % (child_context.pending_symbol,))
            return symbol_value_event()
        return None

    def is_value_decorated():
        # True if the child in progress already carries annotations or a field name.
        return child_context is not None and (child_context.annotations or child_context.field_name is not None)

    def _can_flush():
        return child_context is not None and \
               child_context.depth == 0 and \
               (
                   (
                       child_context.ion_type is not None and
                       (
                           child_context.ion_type.is_numeric or
                           (child_context.ion_type.is_text and not ctx.quoted_text and not is_field_name)
                       )
                   ) or
                   (
                       child_context.line_comment and
                       not is_value_decorated()
                   )
               )

    while True:
        # Loop over all values in this container.
        if c in ctx.container.end or c in ctx.container.delimiter or BufferQueue.is_eof(c):
            # Any of these terminates a pending symbol, which must therefore be a symbol value.
            symbol_event = pending_symbol_value()
            if symbol_event is not None:
                yield symbol_event
                child_context = None
                delimiter_required = ctx.container.is_delimited
            if c in ctx.container.end:
                if not delimiter_required and is_value_decorated():
                    _illegal_character(c, child_context,
                                       'Dangling field name (%s) and/or annotation(s) (%r) at end of container.'
                                       % (child_context.field_name, child_context.annotations))
                # Yield the close event and go to enclosing container. This coroutine instance will never resume.
                yield Transition(
                    IonEvent(IonEventType.CONTAINER_END, ctx.ion_type, depth=ctx.depth-1),
                    ctx.whence
                )
                raise ValueError('Resumed a finished container handler.')
            elif c in ctx.container.delimiter:
                if not delimiter_required:
                    _illegal_character(c, ctx.derive_child_context(None),
                                       'Encountered delimiter %s without preceding value.'
                                       % (_chr(ctx.container.delimiter[0]),))
                is_field_name = ctx.ion_type is IonType.STRUCT
                delimiter_required = False
                c = None
            else:
                assert BufferQueue.is_eof(c)
                assert len(queue) == 0
                yield ctx.read_data_event(self, complete=True)
                c = None
        if c is not None and c not in _WHITESPACE:
            can_flush = False
            if c == _SLASH:
                if child_context is None:
                    # This is the start of a new child value (or, if this is a comment, a new value will start after the
                    # comment ends).
                    child_context = ctx.derive_child_context(self)
                if ctx.ion_type is IonType.SEXP:
                    handler = _sexp_slash_handler(c, child_context, pending_event=pending_symbol_value())
                else:
                    handler = _comment_handler(c, child_context, self)
            elif delimiter_required:
                # This is not the delimiter, or whitespace, or the start of a comment. Throw.
                _illegal_character(c, ctx.derive_child_context(None), 'Delimiter %s not found after value.'
                                   % (_chr(ctx.container.delimiter[0]),))
            elif has_pending_symbol():
                # A character besides whitespace, comments, and delimiters has been found, and there is a pending
                # symbol. That pending symbol is either an annotation, a field name, or a symbol value.
                if c == _COLON:
                    if is_field_name:
                        is_field_name = False
                        child_context.set_field_name()
                        c = None
                    else:
                        assert not ctx.quoted_text
                        if len(queue) == 0:
                            yield ctx.read_data_event(self)
                        c = queue.read_byte()
                        if c == _COLON:
                            child_context.set_annotation()
                            c = None  # forces another character to be read safely
                        else:
                            # Colon that doesn't indicate a field name or annotation.
                            _illegal_character(c, child_context)
                else:
                    if is_field_name:
                        _illegal_character(c, child_context, 'Illegal character after field name %s.'
                                           % child_context.pending_symbol)
                    # It's a symbol value delimited by something other than a comma (i.e. whitespace or comment)
                    yield symbol_value_event()
                    child_context = None
                    delimiter_required = ctx.container.is_delimited
                continue
            else:
                if not is_value_decorated():
                    # This is the start of a new child value.
                    child_context = ctx.derive_child_context(self)
                if is_field_name:
                    handler = _FIELD_NAME_START_TABLE[c](c, child_context)
                else:
                    handler = _VALUE_START_TABLE[c](c, child_context)  # Initialize the new handler
                    can_flush = _IMMEDIATE_FLUSH_TABLE[c]
            container_start = c == _OPEN_BRACKET or \
                              c == _OPEN_PAREN  # _OPEN_BRACE might start a lob; that is handled elsewhere.
            quoted_start = c == _DOUBLE_QUOTE or c == _SINGLE_QUOTE
            while True:
                # Loop over all characters in the current token. A token is either a non-symbol value or a pending
                # symbol, which may end up being a field name, annotation, or symbol value.
                if container_start:
                    # List and sexp start handlers need no further input before emitting their event.
                    c = None
                    container_start = False
                else:
                    if child_context.quoted_text or quoted_start:
                        # Inside quoted text, read whole code points rather than raw bytes.
                        quoted_start = False
                        yield child_context.next_code_point(self)
                        c = child_context.code_point
                    else:
                        if len(queue) == 0:
                            yield ctx.read_data_event(self, can_flush=can_flush)
                        c = queue.read_byte()
                trans = handler.send((c, handler))
                if trans.event is not None:
                    is_self_delimiting = False
                    if child_context.is_composite:
                        # This is a composite transition, i.e. it is an event transition followed by an immediate
                        # transition to the handler coroutine for the next token.
                        next_transition = trans.next_transition
                        child_context = trans.next_context
                        assert next_transition is None or next_transition.event is None
                    else:
                        next_transition = None
                        is_self_delimiting = child_context.is_self_delimiting
                        child_context = None
                    # This child value is finished. c is now the first character in the next value or sequence.
                    # Hence, a new character should not be read; it should be provided to the handler for the next
                    # child context.
                    yield trans
                    event_ion_type = trans.event.ion_type  # None in the case of IVM event.
                    is_container = event_ion_type is not None and event_ion_type.is_container and \
                        trans.event.event_type is not IonEventType.SCALAR
                    if is_container:
                        assert next_transition is None
                        # Descend into the nested container; control returns here once it has ended.
                        yield Transition(
                            None,
                            _container_handler(c, ctx.derive_container_context(trans.event.ion_type, self))
                        )
                    complete = ctx.depth == 0
                    can_flush = False
                    if is_container or is_self_delimiting:
                        # The end of the value has been reached, and c needs to be updated
                        assert not ctx.quoted_text
                        if len(queue) == 0:
                            yield ctx.read_data_event(self, complete, can_flush)
                        c = queue.read_byte()
                    delimiter_required = ctx.container.is_delimited
                    if next_transition is None:
                        break
                    else:
                        trans = next_transition
                elif self is trans.delegate:
                    # The child handler handed control back to this container without emitting an event.
                    child_context.set_ion_type(None)  # The next token will determine the type.
                    complete = False
                    can_flush = _can_flush()
                    if is_field_name:
                        assert not can_flush
                        if c == _COLON or not child_context.is_self_delimiting:
                            break
                    elif has_pending_symbol():
                        can_flush = ctx.depth == 0
                        if not child_context.is_self_delimiting or child_context.line_comment:
                            break
                    elif child_context.is_self_delimiting:
                        # This is the end of a comment. If this is at the top level and is un-annotated,
                        # it may end the stream.
                        complete = ctx.depth == 0 and not is_value_decorated()
                    # This happens at the end of a comment within this container, or when a symbol token has been
                    # found. In both cases, an event should not be emitted. Read the next character and continue.
                    if len(queue) == 0:
                        yield ctx.read_data_event(self, complete, can_flush)
                    c = queue.read_byte()
                    break
                # This is an immediate transition to a handler (may be the same one) for the current token.
                can_flush = _can_flush()
                handler = trans.delegate
        else:
            # Whitespace, or no current character: read the next one.
            assert not ctx.quoted_text
            if len(queue) == 0:
                yield ctx.read_data_event(self, complete, can_flush)
            c = queue.read_byte()


@coroutine
def _skip_trampoline(handler):
    """Sits between the caller and the container handlers, suppressing the events of values that are skipped.

    Ordinary data events are forwarded to the current delegate and the resulting event is emitted. On a SKIP request,
    events are forwarded and discarded until the CONTAINER_END at (or above) the most recently recorded container
    depth arrives; only that event is emitted, along with any requests for more data made along the way.
    """
    data_event, this = (yield None)
    delegate = handler
    event = None
    depth = 0

    def forward():
        # Sends the current data event to the current delegate; both are read at call time.
        result = delegate.send(Transition(data_event, delegate))
        return result.delegate, result.event

    while True:
        skipping = data_event is not None and data_event.type is ReadEventType.SKIP
        if skipping:
            while True:
                delegate, event = forward()
                if event is None:
                    data_event, _ = yield Transition(event, this)
                    continue
                if event.event_type is IonEventType.CONTAINER_END and event.depth <= depth:
                    # The container being skipped has ended; emit this event below.
                    break
                if event.event_type is IonEventType.INCOMPLETE:
                    # More data is needed before the skip can finish.
                    data_event, _ = yield Transition(event, this)
        else:
            delegate, event = forward()
            if event is not None:
                kind = event.event_type
                if kind is IonEventType.CONTAINER_START or kind is IonEventType.CONTAINER_END:
                    # Remember the depth reported by the most recent container boundary.
                    depth = event.depth
        data_event, _ = yield Transition(event, this)


# ``_next_code_point`` with ``yield_char`` pre-bound to ``_NARROW_BUILD``; used by _next_code_point_handler to pull
# code points from the queue.
_next_code_point_iter = partial(_next_code_point, yield_char=_NARROW_BUILD)


@coroutine
def _next_code_point_handler(whence, ctx):
    """Retrieves the next code point from within a quoted string or symbol.

    Backslash escapes are expanded, a carriage return (alone or followed by a newline) is normalized to a single
    newline, and the resulting code point is stored through ``ctx.set_code_point`` before transitioning back to
    ``whence``. Whenever the queue is empty, ``ctx.read_data_event(self)`` is yielded to request more input.

    Args:
        whence (Coroutine): The co-routine to transition to once a code point has been set on the context.
        ctx (_HandlerContext): The context supplying the input queue and the Ion type of the enclosing value.
    """
    _, self = yield
    queue = ctx.queue
    # Unicode escapes are honored for every type except CLOB.
    unicode_escapes_allowed = ctx.ion_type is not IonType.CLOB
    saw_escaped_newline = False
    raw_escape = b''
    awaiting_low_surrogate = False

    def as_escaped(value, text):
        """Wraps ``value`` in a CodePoint flagged as escaped, using ``text`` as its character representation."""
        wrapped = CodePoint(value)
        wrapped.char = text
        wrapped.is_escaped = True
        return wrapped

    while True:
        if len(queue) == 0:
            yield ctx.read_data_event(self)
        queue_iter = iter(queue)
        code_points = _next_code_point_iter(queue, queue_iter)
        code_point = next(code_points)
        if code_point == _BACKSLASH:
            raw_escape += six.int2byte(_BACKSLASH)
            # Total length (in bytes) the escape sequence must reach; only set for the hex-digit based escapes.
            expected_length = None
            while True:
                if len(queue) == 0:
                    yield ctx.read_data_event(self)
                code_point = next(queue_iter)
                if six.indexbytes(raw_escape, -1) != _BACKSLASH:
                    # The escape designator has already been consumed; only hex digits may follow.
                    if code_point not in _HEX_DIGITS:
                        _illegal_character(code_point, ctx,
                                           'Non-hex character %s found in unicode escape.' % (_chr(code_point),))
                    raw_escape += six.int2byte(code_point)
                    if len(raw_escape) == expected_length:
                        break
                    continue
                # The previous byte was the backslash, so this character selects the kind of escape.
                if code_point == _ord(b'u') and unicode_escapes_allowed:
                    # 4-digit unicode escapes, plus '\u' for each surrogate
                    expected_length = 12 if awaiting_low_surrogate else 6
                    awaiting_low_surrogate = False
                elif awaiting_low_surrogate:
                    _illegal_character(code_point, ctx,
                                       'Unpaired high surrogate escape sequence %s.' % (raw_escape,))
                elif code_point == _ord(b'x'):
                    expected_length = 4  # 2-digit hex escapes
                elif code_point == _ord(b'U') and unicode_escapes_allowed:
                    expected_length = 10  # 8-digit unicode escapes
                elif code_point in _COMMON_ESCAPES:
                    if code_point in (_SLASH, _QUESTION_MARK):
                        raw_escape = b''  # Drop the \. Python does not recognize these as escapes.
                    raw_escape += six.int2byte(code_point)
                    break
                elif code_point in _NEWLINES:
                    saw_escaped_newline = True
                    break
                else:
                    # This is a backslash followed by an invalid escape character. This is illegal.
                    _illegal_character(code_point, ctx, 'Invalid escape sequence \\%s.' % (_chr(code_point),))
                raw_escape += six.int2byte(code_point)
            if not saw_escaped_newline:
                decoded = raw_escape.decode('unicode-escape')
                code_point = next(_next_code_point_iter(decoded, iter(decoded), to_int=ord))
                if code_point is None:
                    # This is a high surrogate. Restart the loop to gather the low surrogate.
                    awaiting_low_surrogate = True
                    continue
                code_point = as_escaped(code_point, decoded)
                ctx.set_code_point(code_point)
                yield Transition(None, whence)
        elif awaiting_low_surrogate:
            _illegal_character(code_point, ctx, 'Unpaired high surrogate escape sequence %s.' % (raw_escape,))
        if code_point == _CARRIAGE_RETURN:
            # Normalize all newlines (\r, \n, and \r\n) to \n .
            if len(queue) == 0:
                yield ctx.read_data_event(self)
            code_point = next(queue_iter)
            if code_point != _NEWLINE:
                # A lone carriage return: push the look-ahead back and report a newline instead.
                queue.unread(code_point)
                code_point = _NEWLINE
        while code_point is None:
            # No complete code point was available; request more data and try again.
            yield ctx.read_data_event(self)
            code_point = next(code_points)
        if saw_escaped_newline:
            code_point = as_escaped(code_point, _ESCAPED_NEWLINE)
        ctx.set_code_point(code_point)
        yield Transition(None, whence)


def reader(queue=None, is_unicode=False):
    """Returns a raw text reader co-routine.

    Args:
        queue (Optional[BufferQueue]): The buffer read data for parsing, if ``None`` a new one will be created.
        is_unicode (Optional[bool]): True if all input data to this reader will be of unicode text type; False if
            all input data to this reader will be of binary type.

    Yields:
        IonEvent: parse events, will have an event type of ``INCOMPLETE`` if data is needed
            in the middle of a value or ``STREAM_END`` if there is no data **and** the parser
            is not in the middle of parsing a value.

            Receives :class:`DataEvent`, with :class:`ReadEventType` of ``NEXT`` or ``SKIP``
            to iterate over values; ``DATA`` or ``NEXT`` if the last event type was ``INCOMPLETE``;
            or ``DATA`` if the last event type was ``STREAM_END``.

            When the reader receives ``NEXT`` after yielding ``INCOMPLETE``, this signals to the
            reader that no further data is coming, and that any pending data should be flushed
            as either parse events or errors. This is **only** valid at the top-level, and will
            **only** result in a parse event if the last character encountered...

            * was a digit or a decimal point in a non-timestamp, non-keyword numeric value; OR
            * ended a valid partial timestamp; OR
            * ended a keyword value (special floats, booleans, ``null``, and typed nulls); OR
            * was part of an unquoted symbol token, or whitespace or the end of a comment following
              an unquoted symbol token (as long as no colons were encountered after the token); OR
            * was the closing quote of a quoted symbol token, or whitespace or the end of a comment
              following a quoted symbol token (as long as no colons were encountered after the token); OR
            * was the final closing quote of a long string, or whitespace or the end of a comment
              following a long string.

            If the reader successfully yields a parse event as a result of this, ``NEXT`` is the only
            input that may immediately follow. At that point, there are only two possible responses
            from the reader:

            * If the last character read was the closing quote of an empty symbol following a long
              string, the reader will emit a parse event representing a symbol value with empty text.
              The next reader input/output event pair must be (``NEXT``, ``STREAM_END``).
            * Otherwise, the reader will emit ``STREAM_END``.

            After that ``STREAM_END``, the user may later provide ``DATA`` to resume reading. If this
            occurs, the new data will be interpreted as if it were at the start of the stream (i.e.
            it can never continue the previous value), except that it occurs within the same symbol
            table context. This has the following implications (where ``<FLUSH>`` stands for the
            (``INCOMPLETE``, ``NEXT``) transaction):

            * If the previously-emitted value was a numeric value (``int``, ``float``, ``decimal``,
              ``timestamp``), the new data will never extend that value, even if it would be a valid
              continuation. For example, ``123<FLUSH>456`` will always be emitted as two parse events
              (ints ``123`` and ``456``), even though it would have been interpreted as ``123456``
              without the ``<FLUSH>``.
            * If the previously-emitted value was a symbol value or long string, the new data will be
              interpreted as the start of a new value. For example, ``abc<FLUSH>::123`` will be
              emitted as the symbol value ``'abc'``, followed by an error upon encountering ':' at the
              start of a value, even though it would have been interpreted as the ``int`` ``123``
              annotated with ``'abc'`` without the ``<FLUSH>``. The input ``abc<FLUSH>abc`` will be
              emitted as the symbol value ``'abc'`` (represented by a :class:`SymbolToken`), followed
              by another symbol value ``'abc'`` (represented by a ``SymbolToken`` with the same symbol
              ID), even though it would have been interpreted as ``'abcabc'`` without the ``<FLUSH>``.
              Similarly, ``'''abc'''<FLUSH>'''def'''`` will be interpreted as two strings (``'abc'``
              and ``'def'``), even though it would have been interpreted as ``'abcdef'`` without the
              ``<FLUSH>``.

            ``SKIP`` is only allowed within a container. A reader is *in* a container
            when the ``CONTAINER_START`` event type is encountered and *not in* a container
            when the ``CONTAINER_END`` event type for that container is encountered.
    """
    if queue is None:
        queue = BufferQueue(is_unicode)
    # Parsing always begins at the top level: depth zero, no enclosing value, and no pending symbol.
    ctx = _HandlerContext(
        container=_C_TOP_LEVEL,
        queue=queue,
        field_name=None,
        annotations=None,
        depth=0,
        whence=None,
        value=None,
        ion_type=None,  # Top level
        pending_symbol=None
    )
    # The skip trampoline wraps the top-level container handler so that SKIP requests are honored.
    return reader_trampoline(_skip_trampoline(_container_handler(None, ctx)), allow_flush=True)


text_reader = reader