test/cintltst/ucnvseltst.c

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
 * Copyright (c) 1997-2014, International Business Machines
 * Corporation and others. All Rights Reserved.
 ********************************************************************
 *
 * File UCNVSELTST.C
 *
 * Modification History:
 *        Name                     Description
 *     MOHAMED ELDAWY               Creation
 ********************************************************************
 */

/* C API AND FUNCTIONALITY TEST FOR CONVERTER SELECTOR (ucnvsel.h)*/

#include "ucnvseltst.h"

#include <stdbool.h>
#include <stdio.h>

#include "unicode/utypes.h"
#include "unicode/ucnvsel.h"
#include "unicode/ustring.h"
#include "unicode/utf8.h"
#include "cmemory.h"
#include "cstring.h"
#include "propsvec.h"

#define FILENAME_BUFFER 1024

#define TDSRCPATH  ".." U_FILE_SEP_STRING "test" U_FILE_SEP_STRING "testdata" U_FILE_SEP_STRING

static void TestSelector(void);
static void TestUPropsVector(void);
void addCnvSelTest(TestNode** root);  /* Declaration required to suppress compiler warnings. */

void addCnvSelTest(TestNode** root)
{
    addTest(root, &TestSelector, "tsconv/ucnvseltst/TestSelector");
    addTest(root, &TestUPropsVector, "tsconv/ucnvseltst/TestUPropsVector");
}

static const char **gAvailableNames = NULL;
static int32_t gCountAvailable = 0;

static UBool
getAvailableNames() {
  int32_t i;
  if (gAvailableNames != NULL) {
    return true;
  }
  gCountAvailable = ucnv_countAvailable();
  if (gCountAvailable == 0) {
    log_data_err("No converters available.\n");
    return false;
  }
  gAvailableNames = (const char **)uprv_malloc(gCountAvailable * sizeof(const char *));
  if (gAvailableNames == NULL) {
    log_err("unable to allocate memory for %ld available converter names\n",
            (long)gCountAvailable);
    return false;
  }
  for (i = 0; i < gCountAvailable; ++i) {
    gAvailableNames[i] = ucnv_getAvailableName(i);
  }
  return true;
}

static void
releaseAvailableNames() {
  uprv_free((void *)gAvailableNames);
  gAvailableNames = NULL;
  gCountAvailable = 0;
}

static const char **
getEncodings(int32_t start, int32_t step, int32_t count, int32_t *pCount) {
  const char **names;
  int32_t i;

  *pCount = 0;
  if (count <= 0) {
    return NULL;
  }
  names = (const char **)uprv_malloc(count * sizeof(char *));
  if (names == NULL) {
    log_err("memory allocation error for %ld pointers\n", (long)count);
    return NULL;
  }
  if (step == 0 && count > 0) {
    step = 1;
  }
  for (i = 0; i < count; ++i) {
    if (0 <= start && start < gCountAvailable) {
      names[i] = gAvailableNames[start];
      start += step;
      ++*pCount;
    }
  }
  return names;
}

#if 0
/*
 * ucnvsel_open() does not support "no encodings":
 * Given 0 encodings it will open a selector for all available ones.
 */
static const char **
getNoEncodings(int32_t *pCount) {
  *pCount = 0;
  return NULL;
}
#endif

static const char **
getOneEncoding(int32_t *pCount) {
  return getEncodings(1, 0, 1, pCount);
}

static const char **
getFirstEvenEncodings(int32_t *pCount) {
  return getEncodings(0, 2, 25, pCount);
}

static const char **
getMiddleEncodings(int32_t *pCount) {
  return getEncodings(gCountAvailable - 12, 1, 22, pCount);
}

static const char **
getLastEncodings(int32_t *pCount) {
  return getEncodings(gCountAvailable - 1, -1, 25, pCount);
}

static const char **
getSomeEncodings(int32_t *pCount) {
  /* 20 evenly distributed */
  return getEncodings(5, (gCountAvailable + 19)/ 20, 20, pCount);
}

static const char **
getEveryThirdEncoding(int32_t *pCount) {
  return getEncodings(2, 3, (gCountAvailable + 2 )/ 3, pCount);
}

static const char **
getAllEncodings(int32_t *pCount) {
  return getEncodings(0, 1, gCountAvailable, pCount);
}

typedef const char **GetEncodingsFn(int32_t *);

static GetEncodingsFn *const getEncodingsFns[] = {
  getOneEncoding,
  getFirstEvenEncodings,
  getMiddleEncodings,
  getLastEncodings,
  getSomeEncodings,
  getEveryThirdEncoding,
  getAllEncodings
};

static FILE *fopenOrError(const char *filename) {
    int32_t needLen;
    FILE *f;
    char fnbuf[FILENAME_BUFFER];
    const char* directory = ctest_dataSrcDir();
    needLen = (int32_t)(uprv_strlen(directory) + uprv_strlen(TDSRCPATH) + uprv_strlen(filename) + 1);
    if(needLen > FILENAME_BUFFER) {
        log_err("FAIL: Could not load %s. Filename buffer overflow, needed %d but buffer is %d\n",
                filename, needLen, FILENAME_BUFFER);
        return NULL;
    }

    strcpy(fnbuf, directory);
    strcat(fnbuf, TDSRCPATH);
    strcat(fnbuf, filename);

    f = fopen(fnbuf, "rb");

    if(f == NULL) {
        log_data_err("FAIL: Could not load %s [%s]\n", fnbuf, filename);
    }
    return f;
}

typedef struct TestText {
  char *text, *textLimit;
  char *limit;
  int32_t number;
} TestText;

static void
text_reset(TestText *tt) {
  tt->limit = tt->text;
  tt->number = 0;
}

static char *
text_nextString(TestText *tt, int32_t *pLength) {
  char *s = tt->limit;
  if (s == tt->textLimit) {
    /* we already delivered the last string */
    return NULL;
  } else if (s == tt->text) {
    /* first string */
    if ((tt->textLimit - tt->text) >= 3 &&
        s[0] == (char)0xef && s[1] == (char)0xbb && s[2] == (char)0xbf
    ) {
      s += 3;  /* skip the UTF-8 signature byte sequence (U+FEFF) */
    }
  } else {
    /* skip the string terminator */
    ++s;
    ++tt->number;
  }

  /* find the end of this string */
  tt->limit = uprv_strchr(s, 0);
  *pLength = (int32_t)(tt->limit - s);
  return s;
}

static UBool
text_open(TestText *tt) {
  FILE *f;
  char *s;
  int32_t length;
  uprv_memset(tt, 0, sizeof(TestText));
  f = fopenOrError("ConverterSelectorTestUTF8.txt");
  if(!f) {
    return false;
  }
  fseek(f, 0, SEEK_END);
  length = (int32_t)ftell(f);
  fseek(f, 0, SEEK_SET);
  tt->text = (char *)uprv_malloc(length + 1);
  if (tt->text == NULL) {
    fclose(f);
    return false;
  }
  if (length != (int32_t)fread(tt->text, 1, length, f)) {
    log_err("error reading %ld bytes from test text file\n", (long)length);
    length = 0;
    uprv_free(tt->text);
  }
  fclose(f);
  tt->textLimit = tt->text + length;
  *tt->textLimit = 0;
  /* replace all Unicode '#' (U+0023) with NUL */
  for(s = tt->text; (s = uprv_strchr(s, 0x23)) != NULL; *s++ = 0) {}
  text_reset(tt);
  return true;
}

static void
text_close(TestText *tt) {
  uprv_free(tt->text);
}

static int32_t findIndex(const char* converterName) {
  int32_t i;
  for (i = 0 ; i < gCountAvailable; i++) {
    if(ucnv_compareNames(gAvailableNames[i], converterName) == 0) {
      return i;
    }
  }
  return -1;
}

static UBool *
getResultsManually(const char** encodings, int32_t num_encodings,
                   const char *utf8, int32_t length,
                   const USet* excludedCodePoints, const UConverterUnicodeSet whichSet) {
  UBool* resultsManually;
  int32_t i;

  resultsManually = (UBool*) uprv_malloc(gCountAvailable);
  uprv_memset(resultsManually, 0, gCountAvailable);

  for(i = 0 ; i < num_encodings ; i++) {
    UErrorCode status = U_ZERO_ERROR;
    /* get unicode set for that converter */
    USet* set;
    UConverter* test_converter;
    UChar32 cp;
    int32_t encIndex, offset;

    set = uset_openEmpty();
    test_converter = ucnv_open(encodings[i], &status);
    ucnv_getUnicodeSet(test_converter, set,
                       whichSet, &status);
    if (excludedCodePoints != NULL) {
      uset_addAll(set, excludedCodePoints);
    }
    uset_freeze(set);
    offset = 0;
    cp = 0;

    encIndex = findIndex(encodings[i]);
    /*
     * The following is almost, but not entirely, the same as
     * resultsManually[encIndex] =
     *   (UBool)(uset_spanUTF8(set, utf8, length, USET_SPAN_SIMPLE) == length);
     * They might be different if the set contains strings,
     * or if the utf8 string contains an illegal sequence.
     *
     * The UConverterSelector does not currently handle strings that can be
     * converted, and it treats an illegal sequence as convertible
     * while uset_spanUTF8() treats it like U+FFFD which may not be convertible.
     */
    resultsManually[encIndex] = true;
    while(offset<length) {
      U8_NEXT(utf8, offset, length, cp);
      if (cp >= 0 && !uset_contains(set, cp)) {
        resultsManually[encIndex] = false;
        break;
      }
    }
    uset_close(set);
    ucnv_close(test_converter);
  }
  return resultsManually;
}

/* closes res but does not free resultsManually */
static void verifyResult(UEnumeration* res, const UBool *resultsManually) {
  UBool* resultsFromSystem = (UBool*) uprv_malloc(gCountAvailable * sizeof(UBool));
  const char* name;
  UErrorCode status = U_ZERO_ERROR;
  int32_t i;

  /* fill the bool for the selector results! */
  uprv_memset(resultsFromSystem, 0, gCountAvailable);
  while ((name = uenum_next(res,NULL, &status)) != NULL) {
    resultsFromSystem[findIndex(name)] = true;
  }
  for(i = 0 ; i < gCountAvailable; i++) {
    if(resultsManually[i] != resultsFromSystem[i]) {
      log_err("failure in converter selector\n"
              "converter %s had conflicting results -- manual: %d, system %d\n",
              gAvailableNames[i], resultsManually[i], resultsFromSystem[i]);
    }
  }
  uprv_free(resultsFromSystem);
  uenum_close(res);
}

static UConverterSelector *
serializeAndUnserialize(UConverterSelector *sel, char **buffer, UErrorCode *status) {
  char *new_buffer;
  int32_t ser_len, ser_len2;
  /* preflight */
  ser_len = ucnvsel_serialize(sel, NULL, 0, status);
  if (*status != U_BUFFER_OVERFLOW_ERROR) {
    log_err("ucnvsel_serialize(preflighting) failed: %s\n", u_errorName(*status));
    return sel;
  }
  new_buffer = (char *)uprv_malloc(ser_len);
  *status = U_ZERO_ERROR;
  ser_len2 = ucnvsel_serialize(sel, new_buffer, ser_len, status);
  if (U_FAILURE(*status) || ser_len != ser_len2) {
    log_err("ucnvsel_serialize() failed: %s\n", u_errorName(*status));
    uprv_free(new_buffer);
    return sel;
  }
  ucnvsel_close(sel);
  uprv_free(*buffer);
  *buffer = new_buffer;
  sel = ucnvsel_openFromSerialized(new_buffer, ser_len, status);
  if (U_FAILURE(*status)) {
    log_err("ucnvsel_openFromSerialized() failed: %s\n", u_errorName(*status));
    return NULL;
  }
  return sel;
}

static void TestSelector()
{
  TestText text;
  USet* excluded_sets[3] = { NULL };
  int32_t i, testCaseIdx;

  if (!getAvailableNames()) {
    return;
  }
  if (!text_open(&text)) {
    releaseAvailableNames();
  }

  excluded_sets[0] = uset_openEmpty();
  for(i = 1 ; i < 3 ; i++) {
    excluded_sets[i] = uset_open(i*30, i*30+500);
  }

  for(testCaseIdx = 0; testCaseIdx < UPRV_LENGTHOF(getEncodingsFns); testCaseIdx++)
  {
    int32_t excluded_set_id;
    int32_t num_encodings;
    const char **encodings = getEncodingsFns[testCaseIdx](&num_encodings);
    if (getTestOption(QUICK_OPTION) && num_encodings > 25) {
      uprv_free((void *)encodings);
      continue;
    }

    /*
     * for(excluded_set_id = 0 ; excluded_set_id < 3 ; excluded_set_id++)
     *
     * This loop was replaced by the following statement because
     * the loop made the test run longer without adding to the code coverage.
     * The handling of the exclusion set is independent of the
     * set of encodings, so there is no need to test every combination.
     */
    excluded_set_id = testCaseIdx % UPRV_LENGTHOF(excluded_sets);
    {
      UConverterSelector *sel_rt, *sel_fb;
      char *buffer_fb = NULL;
      UErrorCode status = U_ZERO_ERROR;
      sel_rt = ucnvsel_open(encodings, num_encodings,
                            excluded_sets[excluded_set_id],
                            UCNV_ROUNDTRIP_SET, &status);
      if (num_encodings == gCountAvailable) {
        /* test the special "all converters" parameter values */
        sel_fb = ucnvsel_open(NULL, 0,
                              excluded_sets[excluded_set_id],
                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
      } else if (uset_isEmpty(excluded_sets[excluded_set_id])) {
        /* test that a NULL set gives the same results as an empty set */
        sel_fb = ucnvsel_open(encodings, num_encodings,
                              NULL,
                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
      } else {
        sel_fb = ucnvsel_open(encodings, num_encodings,
                              excluded_sets[excluded_set_id],
                              UCNV_ROUNDTRIP_AND_FALLBACK_SET, &status);
      }
      if (U_FAILURE(status)) {
        log_err("ucnv_sel_open(encodings %ld) failed - %s\n", testCaseIdx, u_errorName(status));
        ucnvsel_close(sel_rt);
        uprv_free((void *)encodings);
        continue;
      }

      text_reset(&text);
      for (;;) {
        UBool *manual_rt, *manual_fb;
        static UChar utf16[10000];
        char *s;
        int32_t length8, length16;

        s = text_nextString(&text, &length8);
        if (s == NULL || (getTestOption(QUICK_OPTION) && text.number > 3)) {
          break;
        }

        manual_rt = getResultsManually(encodings, num_encodings,
                                       s, length8,
                                       excluded_sets[excluded_set_id],
                                       UCNV_ROUNDTRIP_SET);
        manual_fb = getResultsManually(encodings, num_encodings,
                                       s, length8,
                                       excluded_sets[excluded_set_id],
                                       UCNV_ROUNDTRIP_AND_FALLBACK_SET);
        /* UTF-8 with length */
        status = U_ZERO_ERROR;
        verifyResult(ucnvsel_selectForUTF8(sel_rt, s, length8, &status), manual_rt);
        verifyResult(ucnvsel_selectForUTF8(sel_fb, s, length8, &status), manual_fb);
        /* UTF-8 NUL-terminated */
        verifyResult(ucnvsel_selectForUTF8(sel_rt, s, -1, &status), manual_rt);
        verifyResult(ucnvsel_selectForUTF8(sel_fb, s, -1, &status), manual_fb);

        u_strFromUTF8(utf16, UPRV_LENGTHOF(utf16), &length16, s, length8, &status);
        if (U_FAILURE(status)) {
          log_err("error converting the test text (string %ld) to UTF-16 - %s\n",
                  (long)text.number, u_errorName(status));
        } else {
          if (text.number == 0) {
            sel_fb = serializeAndUnserialize(sel_fb, &buffer_fb, &status);
          }
          if (U_SUCCESS(status)) {
            /* UTF-16 with length */
            verifyResult(ucnvsel_selectForString(sel_rt, utf16, length16, &status), manual_rt);
            verifyResult(ucnvsel_selectForString(sel_fb, utf16, length16, &status), manual_fb);
            /* UTF-16 NUL-terminated */
            verifyResult(ucnvsel_selectForString(sel_rt, utf16, -1, &status), manual_rt);
            verifyResult(ucnvsel_selectForString(sel_fb, utf16, -1, &status), manual_fb);
          }
        }

        uprv_free(manual_rt);
        uprv_free(manual_fb);
      }
      ucnvsel_close(sel_rt);
      ucnvsel_close(sel_fb);
      uprv_free(buffer_fb);
    }
    uprv_free((void *)encodings);
  }

  releaseAvailableNames();
  text_close(&text);
  for(i = 0 ; i < 3 ; i++) {
    uset_close(excluded_sets[i]);
  }
}

/* Improve code coverage of UPropsVectors */
static void TestUPropsVector() {
    UErrorCode errorCode = U_ILLEGAL_ARGUMENT_ERROR;
    UPropsVectors *pv = upvec_open(100, &errorCode);
    if (pv != NULL) {
        log_err("Should have returned NULL if UErrorCode is an error.");
        return;
    }
    errorCode = U_ZERO_ERROR;
    pv = upvec_open(-1, &errorCode);
    if (pv != NULL || U_SUCCESS(errorCode)) {
        log_err("Should have returned NULL if column is less than 0.\n");
        return;
    }
    errorCode = U_ZERO_ERROR;
    pv = upvec_open(100, &errorCode);
    if (pv == NULL || U_FAILURE(errorCode)) {
        log_err("Unable to open UPropsVectors.\n");
        return;
    }

    if (upvec_getValue(pv, 0, 1) != 0) {
        log_err("upvec_getValue should return 0.\n");
    }
    if (upvec_getRow(pv, 0, NULL, NULL) == NULL) {
        log_err("upvec_getRow should not return NULL.\n");
    }
    if (upvec_getArray(pv, NULL, NULL) != NULL) {
        log_err("upvec_getArray should return NULL.\n");
    }

    upvec_close(pv);
}