correctness/test-parse-random/runtests.py

#!/usr/bin/env python2.7
#
# Copyright 2015 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

"""
Testing dec2flt
===============
These are *really* extensive tests. Expect them to run for hours. Due to the
nature of the problem (the input is a string of arbitrary length), exhaustive
testing is not really possible. Instead, there are exhaustive tests for some
classes of inputs for which that is feasible and a bunch of deterministic and
random non-exhaustive tests for covering everything else.

The actual tests (generating decimal strings and feeding them to dec2flt) is
performed by a set of stand-along rust programs. This script compiles, runs,
and supervises them. The programs report the strings they generate and the
floating point numbers they converted those strings to, and this script
checks that the results are correct.

You can run specific tests rather than all of them by giving their names
(without .rs extension) as command line parameters.

Verification
------------
The tricky part is not generating those inputs but verifying the outputs.
Comparing with the result of Python's float() does not cut it because
(and this is apparently undocumented) although Python includes a version of
Martin Gay's code including the decimal-to-float part, it doesn't actually use
it for float() (only for round()) instead relying on the system scanf() which
is not necessarily completely accurate.

Instead, we take the input and compute the true value with bignum arithmetic
(as a fraction, using the ``fractions`` module).

Given an input string and the corresponding float computed via Rust, simply
decode the float into f * 2^k (for integers f, k) and the ULP.
We can now easily compute the error and check if it is within 0.5 ULP as it
should be. Zero and infinites are handled similarly:

- If the approximation is 0.0, the exact value should be *less or equal*
  half the smallest denormal float: the smallest denormal floating point
  number has an odd mantissa (00...001) and thus half of that is rounded
  to 00...00, i.e., zero.
- If the approximation is Inf, the exact value should be *greater or equal*
  to the largest finite float + 0.5 ULP: the largest finite float has an odd
  mantissa (11...11), so that plus half an ULP is rounded up to the nearest
  even number, which overflows.

Implementation details
----------------------
This directory contains a set of single-file Rust programs that perform
tests with a particular class of inputs. Each is compiled and run without
parameters, outputs (f64, f32, decimal) pairs to verify externally, and
in any case either exits gracefully or with a panic.

If a test binary writes *anything at all* to stderr or exits with an
exit code that's not 0, the test fails.
The output on stdout is treated as (f64, f32, decimal) record, encoded thusly:

- First, the bits of the f64 encoded as an ASCII hex string.
- Second, the bits of the f32 encoded as an ASCII hex string.
- Then the corresponding string input, in ASCII
- The record is terminated with a newline.

Incomplete records are an error. Not-a-Number bit patterns are invalid too.

The tests run serially but the validation for a single test is parallelized
with ``multiprocessing``. Each test is launched as a subprocess.
One thread supervises it: Accepts and enqueues records to validate, observe
stderr, and waits for the process to exit. A set of worker processes perform
the validation work for the outputs enqueued there. Another thread listens
for progress updates from the workers.

Known issues
------------
Some errors (e.g., NaN outputs) aren't handled very gracefully.
Also, if there is an exception or the process is interrupted (at least on
Windows) the worker processes are leaked and stick around forever.
They're only a few megabytes each, but still, this script should not be run
if you aren't prepared to manually kill a lot of orphaned processes.
"""
from __future__ import print_function
import sys
import os
import time
import struct
from fractions import Fraction
from collections import namedtuple
from subprocess import Popen, check_call, PIPE
from glob import glob
import multiprocessing
import threading
import ctypes
import binascii

try:  # Python 3
    import queue as Queue
except ImportError:  # Python 2
    import Queue

NUM_WORKERS = 2
UPDATE_EVERY_N = 50000
INF = namedtuple('INF', '')()
NEG_INF = namedtuple('NEG_INF', '')()
ZERO = namedtuple('ZERO', '')()
MAILBOX = None  # The queue for reporting errors to the main process.
STDOUT_LOCK = threading.Lock()
test_name = None
child_processes = []
exit_status = 0

def msg(*args):
    with STDOUT_LOCK:
        print("[" + test_name + "]", *args)
        sys.stdout.flush()


def write_errors():
    global exit_status
    f = open("errors.txt", 'w')
    have_seen_error = False
    while True:
        args = MAILBOX.get()
        if args is None:
            f.close()
            break
        print(*args, file=f)
        f.flush()
        if not have_seen_error:
            have_seen_error = True
            msg("Something is broken:", *args)
            msg("Future errors logged to errors.txt")
            exit_status = 101

def projectdir():
    file = os.path.realpath(__file__)
    return os.path.dirname(os.path.dirname(file))

def targetdir():
    return os.path.join(projectdir(), 'target')

def releasedir():
    return os.path.join(targetdir(), 'release')

def cargo():
    path = os.getcwd()
    os.chdir(projectdir())
    check_call(['cargo', 'build', '--release'])
    os.chdir(path)


def run(test):
    global test_name
    test_name = test

    t0 = time.perf_counter()
    msg("setting up supervisor")
    command = ['cargo', 'run', '--bin', test, '--release']
    proc = Popen(command, bufsize=1<<20 , stdin=PIPE, stdout=PIPE, stderr=PIPE)
    done = multiprocessing.Value(ctypes.c_bool)
    queue = multiprocessing.Queue(maxsize=5)#(maxsize=1024)
    workers = []
    for n in range(NUM_WORKERS):
        worker = multiprocessing.Process(name='Worker-' + str(n + 1),
                                         target=init_worker,
                                         args=[test, MAILBOX, queue, done])
        workers.append(worker)
        child_processes.append(worker)
    for worker in workers:
        worker.start()
    msg("running test")
    interact(proc, queue)
    with done.get_lock():
        done.value = True
    for worker in workers:
        worker.join()
    msg("python is done")
    assert queue.empty(), "did not validate everything"
    dt = time.perf_counter() - t0
    msg("took", round(dt, 3), "seconds")


def interact(proc, queue):
    n = 0
    while proc.poll() is None:
        line = proc.stdout.readline()
        if not line:
            continue
        assert line.endswith(b'\n'), "incomplete line: " + repr(line)
        queue.put(line)
        n += 1
        if n % UPDATE_EVERY_N == 0:
            msg("got", str(n // 1000) + "k", "records")
    msg("rust is done. exit code:", proc.returncode)
    rest, stderr = proc.communicate()
    if stderr:
        msg("rust stderr output:", stderr)
    for line in rest.split(b'\n'):
        if not line:
            continue
        queue.put(line)


def main():
    global MAILBOX
    files = glob(f'{projectdir()}/test-parse-random/*.rs')
    basenames = [os.path.basename(i) for i in files]
    all_tests = [os.path.splitext(f)[0] for f in basenames if not f.startswith('_')]
    args = sys.argv[1:]
    if args:
        tests = [test for test in all_tests if test in args]
    else:
        tests = all_tests
    if not tests:
        print("Error: No tests to run")
        sys.exit(1)
    # Compile first for quicker feedback
    cargo()
    # Set up mailbox once for all tests
    MAILBOX = multiprocessing.Queue()
    mailman = threading.Thread(target=write_errors)
    mailman.daemon = True
    mailman.start()
    for test in tests:
        run(test)
    MAILBOX.put(None)
    mailman.join()


# ---- Worker thread code ----


POW2 = { e: Fraction(2) ** e for e in range(-1100, 1100) }
HALF_ULP = { e: (Fraction(2) ** e)/2 for e in range(-1100, 1100) }
DONE_FLAG = None


def send_error_to_supervisor(*args):
    MAILBOX.put(args)


def init_worker(test, mailbox, queue, done):
    global test_name, MAILBOX, DONE_FLAG
    test_name = test
    MAILBOX = mailbox
    DONE_FLAG = done
    do_work(queue)


def is_done():
    with DONE_FLAG.get_lock():
        return DONE_FLAG.value


def do_work(queue):
    while True:
        try:
            line = queue.get(timeout=0.01)
        except Queue.Empty:
            if queue.empty() and is_done():
                return
            else:
                continue
        bin64, bin32, text = line.rstrip().split()
        validate(bin64, bin32, text.decode('utf-8'))


def decode_binary64(x):
    """
    Turn a IEEE 754 binary64 into (mantissa, exponent), except 0.0 and
    infinity (positive and negative), which return ZERO, INF, and NEG_INF
    respectively.
    """
    x = binascii.unhexlify(x)
    assert len(x) == 8, repr(x)
    [bits] = struct.unpack(b'>Q', x)
    if bits == 0:
        return ZERO
    exponent = (bits >> 52) & 0x7FF
    negative = bits >> 63
    low_bits = bits & 0xFFFFFFFFFFFFF
    if exponent == 0:
        mantissa = low_bits
        exponent += 1
        if mantissa == 0:
            return ZERO
    elif exponent == 0x7FF:
        assert low_bits == 0, "NaN"
        if negative:
            return NEG_INF
        else:
            return INF
    else:
        mantissa = low_bits | (1 << 52)
    exponent -= 1023 + 52
    if negative:
        mantissa = -mantissa
    return (mantissa, exponent)


def decode_binary32(x):
    """
    Turn a IEEE 754 binary32 into (mantissa, exponent), except 0.0 and
    infinity (positive and negative), which return ZERO, INF, and NEG_INF
    respectively.
    """
    x = binascii.unhexlify(x)
    assert len(x) == 4, repr(x)
    [bits] = struct.unpack(b'>I', x)
    if bits == 0:
        return ZERO
    exponent = (bits >> 23) & 0xFF
    negative = bits >> 31
    low_bits = bits & 0x7FFFFF
    if exponent == 0:
        mantissa = low_bits
        exponent += 1
        if mantissa == 0:
            return ZERO
    elif exponent == 0xFF:
        if negative:
            return NEG_INF
        else:
            return INF
    else:
        mantissa = low_bits | (1 << 23)
    exponent -= 127 + 23
    if negative:
        mantissa = -mantissa
    return (mantissa, exponent)


MIN_SUBNORMAL_DOUBLE = Fraction(2) ** -1074
MIN_SUBNORMAL_SINGLE = Fraction(2) ** -149  # XXX unsure
MAX_DOUBLE = (2 - Fraction(2) ** -52) * (2 ** 1023)
MAX_SINGLE = (2 - Fraction(2) ** -23) * (2 ** 127)
MAX_ULP_DOUBLE = 1023 - 52
MAX_ULP_SINGLE = 127 - 23
DOUBLE_ZERO_CUTOFF = MIN_SUBNORMAL_DOUBLE / 2
DOUBLE_INF_CUTOFF = MAX_DOUBLE + 2 ** (MAX_ULP_DOUBLE - 1)
SINGLE_ZERO_CUTOFF = MIN_SUBNORMAL_SINGLE / 2
SINGLE_INF_CUTOFF = MAX_SINGLE + 2 ** (MAX_ULP_SINGLE - 1)

def validate(bin64, bin32, text):
    try:
        double = decode_binary64(bin64)
    except AssertionError:
        print(bin64, bin32, text)
        raise
    single = decode_binary32(bin32)
    real = Fraction(text)

    if double is ZERO:
        if real > DOUBLE_ZERO_CUTOFF:
            record_special_error(text, "f64 zero")
    elif double is INF:
        if real < DOUBLE_INF_CUTOFF:
            record_special_error(text, "f64 inf")
    elif double is NEG_INF:
        if -real < DOUBLE_INF_CUTOFF:
            record_special_error(text, "f64 -inf")
    elif len(double) == 2:
        sig, k = double
        validate_normal(text, real, sig, k, "f64")
    else:
        assert 0, "didn't handle binary64"
    if single is ZERO:
        if real > SINGLE_ZERO_CUTOFF:
            record_special_error(text, "f32 zero")
    elif single is INF:
        if real < SINGLE_INF_CUTOFF:
            record_special_error(text, "f32 inf")
    elif single is NEG_INF:
        if -real < SINGLE_INF_CUTOFF:
            record_special_error(text, "f32 -inf")
    elif len(single) == 2:
        sig, k = single
        validate_normal(text, real, sig, k, "f32")
    else:
        assert 0, "didn't handle binary32"

def record_special_error(text, descr):
    send_error_to_supervisor(text.strip(), "wrongly rounded to", descr)


def validate_normal(text, real, sig, k, kind):
    approx = sig * POW2[k]
    error = abs(approx - real)
    if error > HALF_ULP[k]:
        record_normal_error(text, error, k, kind)


def record_normal_error(text, error, k, kind):
    one_ulp = HALF_ULP[k + 1]
    assert one_ulp == 2 * HALF_ULP[k]
    relative_error = error / one_ulp
    text = text.strip()
    try:
        err_repr = float(relative_error)
    except ValueError:
        err_repr = str(err_repr).replace('/', ' / ')
    send_error_to_supervisor(err_repr, "ULP error on", text, "(" + kind + ")")


if __name__ == '__main__':
    main()