/*---------------------------------------------------------------------------*
 *  text_parser.c  *
 *                                                                           *
 *  Copyright 2007, 2008 Nuance Communications, Inc.                               *
 *                                                                           *
 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
 *  you may not use this file except in compliance with the License.         *
 *                                                                           *
 *  You may obtain a copy of the License at                                  *
 *      http://www.apache.org/licenses/LICENSE-2.0                           *
 *                                                                           *
 *  Unless required by applicable law or agreed to in writing, software      *
 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 
 *  See the License for the specific language governing permissions and      *
 *  limitations under the License.                                           *
 *                                                                           *
 *---------------------------------------------------------------------------*/

#include"pstdio.h"
#include"srec_context.h"
#include"astar.h"

#include "passert.h"
#include "portable.h"


#define MAX_LOCAL_LEN 256
#define PARSE_PASS 0
#define PARSE_FAIL 1


/*
 * Check a transcription against the arc-token graph, consuming its words
 * from the last one back to the first.
 *
 * context        supplies the word map (olabels), the arc token list and
 *                the beg_silence_word id handed to the arc lookups
 * atok           arc token to start matching from
 * transcription  NUL-terminated, space-separated words
 * tlen           number of leading characters of transcription to consider
 *
 * Returns PARSE_PASS once the first word has been matched and the arc
 * matched for it has a successor arc whose ilabel is MAXwordID.  Returns
 * PARSE_FAIL when a word cannot be matched, when that successor is missing,
 * or when the transcription does not fit the local word buffer.
 */
static int check_word_path(srec_context* context, arc_token* atok,
                           const char* transcription, int tlen)
{
  if (strlen(transcription) >= MAX_LOCAL_LEN - 1)
  {
    PLogError("Transcription too long [%s]\n", transcription);
    return PARSE_FAIL;
  }

  for (;;)
  {
    char        word_buf[MAX_LOCAL_LEN];
    const char* word_start;
    arc_token*  matched;
    arc_token*  succ;
    wordID      id;
    int         n;
    int         has_final_successor;

    /* find the first character of the last word inside the first tlen chars */
    word_start = transcription;
    if (tlen > 0)
    {
      word_start = transcription + tlen - 1;
      while (word_start > transcription)
      {
        if (*word_start == ' ')
        {
          word_start++;
          break;
        }
        word_start--;
      }
    }

    /* copy that word, up to the next space or the end of the string */
    for (n = 0; ; n++)
    {
      char c;
      if (n >= MAX_LOCAL_LEN)
      {
        PLogError("Word too long in transcription [%s]\n", transcription);
        return PARSE_FAIL;
      }
      c = word_start[n];
      if (c == ' ' || c == '\0')
      {
        word_buf[n] = 0;
        break;
      }
      word_buf[n] = c;
    }

    id = wordmap_find_index(context->olabels, word_buf);
    if (id < MAXwordID)
    {
      matched = get_arc_for_word(atok, id, context, context->beg_silence_word);
    }
    else
    {
      /* NOTE(review): the callee is given the pointer into transcription
         (the word plus whatever follows it), not the isolated copy in
         word_buf -- confirm this is what it expects. */
      matched = get_arc_for_word_without_slot_annotation(atok, word_start, context, context->beg_silence_word);
    }
    if (matched == NULL)
      return PARSE_FAIL;

    /* does any arc leaving the matched arc carry the MAXwordID ilabel? */
    has_final_successor = 0;
    succ = ARC_TOKEN_PTR(context->arc_token_list, matched->first_next_arc);
    while (succ != NULL)
    {
      if (succ->ilabel == MAXwordID)
      {
        has_final_successor = 1;
        break;
      }
      succ = ARC_TOKEN_PTR(context->arc_token_list, succ->next_token_index);
    }

    /* the first word of the transcription was just handled: decide */
    if (word_start == transcription)
      return has_final_successor ? PARSE_PASS : PARSE_FAIL;

    /* shrink tlen back to the space in front of the word just handled */
    do
    {
      tlen--;
    }
    while (transcription[tlen] != ' ' && tlen > 0);

    atok = matched;
  }
}

/*
 * Check a transcription against the context's arc-token graph, starting
 * from the first entry of the arc token list.
 *
 * Trailing spaces are excluded from the length handed to check_word_path();
 * the transcription itself is not modified.
 *
 * Returns whatever check_word_path() returns (PARSE_PASS or PARSE_FAIL).
 */
int FST_CheckPath_Simple(srec_context* context, const char* transcription)
{
  int len = strlen(transcription);

  /* ignore any run of spaces at the end */
  while (len > 0 && transcription[len - 1] == ' ')
    len--;

  return check_word_path(context, &context->arc_token_list[0], transcription, len);
}

int FST_CheckPath_Complex(srec_context* context, const char* transcription,
                          char* literal, size_t max_literal_len)
{
  int i, j, rc;
  int num_spaces;
  char copy_of_transcription[MAX_LOCAL_LEN];
  char* spaces[24], *p; /* can't go too high here!! */
  ASSERT(strlen(transcription) < MAX_LOCAL_LEN);
  
  strcpy(copy_of_transcription, transcription);
  for (num_spaces = 0, p = copy_of_transcription; *p; p++)
  {
    if (*p == ' ')
    {
      if ((size_t)num_spaces >= sizeof(spaces) / sizeof(char*))
      {
        PLogError("FST_CheckPath_Complex() failed on too many words\n");
        return PARSE_FAIL;
      }
      spaces[num_spaces++] = p;
    }
  }
  
  if (num_spaces == 0)
  {
    rc = FST_CheckPath_Simple(context, transcription);
    if (rc == PARSE_PASS)
    {
      ASSERT(strlen(copy_of_transcription) < max_literal_len);
      strcpy(literal, copy_of_transcription);
    }
    return rc;
  }
  
  for (i = 0; i < (1 << num_spaces); i++)
  {
    /* find the space pointers */
    for (j = 0; j < num_spaces; j++)
      *spaces[j] = i & (1 << j) ? '_' : ' ';
    /* check each word, potentially within a rule! */
    for (p = strtok(copy_of_transcription, " "); p; p = strtok(NULL, " "))
    {
      wordID k, wdid = wordmap_find_index(context->olabels, p);
      if (wdid < MAXwordID) continue;
      for (k = 1; k < context->olabels->num_slots; k++)
      {
        wdid = wordmap_find_index_in_rule(context->olabels, p, k);
        if (wdid < MAXwordID) break;
      }
      if (wdid == MAXwordID)
        goto next_i;
    }
    /* fix the nulls back */
    for (j = 0; j < num_spaces; j++)
      *spaces[j] = i & (1 << j) ? '_' : ' ';
    rc = FST_CheckPath_Simple(context, copy_of_transcription);
    if (rc == PARSE_PASS)
    {
      ASSERT(strlen(copy_of_transcription) < max_literal_len);
      strcpy(literal, copy_of_transcription);
      return rc;
    }
next_i:
    continue;
  }
  return PARSE_FAIL;
}

static void clean_up_sentence(char* s);

/*
 * Normalize a transcription with clean_up_sentence() and check it with
 * FST_CheckPath_Complex().
 *
 * context          supplies the arc token list and word map
 * transcription    NUL-terminated text; left unmodified (a local copy is
 *                  cleaned up instead)
 * literal          receives the matching text on PARSE_PASS
 * max_literal_len  size of literal in bytes
 *
 * Returns PARSE_FAIL when the transcription does not fit the local buffer
 * (checked at run time, so the copy below stays in bounds even if
 * assertions are compiled out), 2 when the context has no arc token list,
 * and otherwise the result of FST_CheckPath_Complex().
 */
int FST_CheckPath(srec_context* context, const char* transcription,
                  char* literal, size_t max_literal_len)
{
  char mytranscription[MAX_LOCAL_LEN];

  if (strlen(transcription) >= sizeof(mytranscription))
  {
    PLogError("Transcription too long [%s]\n", transcription);
    return PARSE_FAIL;
  }
  strcpy(mytranscription, transcription);
  clean_up_sentence(mytranscription);

  if (!context->arc_token_list)
    return 2; /* neither PARSE_PASS nor PARSE_FAIL: no arc token list to check against */

  return FST_CheckPath_Complex(context, mytranscription, literal, max_literal_len);
}

/*
 * Normalize a sentence in place:
 *   - every "[...]" span (brackets included) and every stray ']' is
 *     treated as a space; a '[' with no closing ']' blanks everything up
 *     to the end of the string
 *   - leading and trailing spaces are removed
 *   - runs of spaces are collapsed to a single space
 *
 * Only ' ' counts as a space.  The result is never longer than the input,
 * so the rewrite happens inside the caller's buffer, and nothing at or
 * beyond the original terminator is read or written.
 */
static void clean_up_sentence(char* s)
{
  const char* src;
  char* dst = s;
  int in_code = 0;        /* inside a "[...]" span */
  int pending_space = 0;  /* a separator is owed before the next kept char */

  for (src = s; *src; src++)
  {
    char c = *src;

    /* change speech codes to spaces */
    if (in_code)
    {
      if (c == ']') in_code = 0;
      c = ' ';
    }
    else if (c == '[')
    {
      in_code = 1;
      c = ' ';
    }
    else if (c == ']')
    {
      c = ' ';
    }

    if (c == ' ')
    {
      /* defer the space: it is emitted only if a kept character follows */
      pending_space = 1;
      continue;
    }

    /* dst == s means nothing was kept yet, i.e. this would be a leading space */
    if (pending_space && dst != s)
      *dst++ = ' ';
    pending_space = 0;
    *dst++ = c;
  }

  /* a space still pending here was trailing and is dropped */
  *dst = '\0';
}