/*---------------------------------------------------------------------------*
 *  text_parser.c                                                            *
 *                                                                           *
 *  Copyright 2007, 2008 Nuance Communciations, Inc.                         *
 *                                                                           *
 *  Licensed under the Apache License, Version 2.0 (the 'License');          *
 *  you may not use this file except in compliance with the License.         *
 *                                                                           *
 *  You may obtain a copy of the License at                                  *
 *      http://www.apache.org/licenses/LICENSE-2.0                           *
 *                                                                           *
 *  Unless required by applicable law or agreed to in writing, software      *
 *  distributed under the License is distributed on an 'AS IS' BASIS,        *
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
 *  See the License for the specific language governing permissions and      *
 *  limitations under the License.                                           *
 *                                                                           *
 *---------------------------------------------------------------------------*/

#include"pstdio.h"
#include"srec_context.h"
#include"astar.h"

#include "passert.h"
#include "portable.h"

#define MAX_LOCAL_LEN 256
#define PARSE_PASS 0
#define PARSE_FAIL 1

/**
 * Checks whether the words of a transcription can be followed through the
 * arc tokens of a context.
 *
 * The transcription is consumed from its LAST word towards its FIRST word.
 * Each word is looked up in context->olabels; when that lookup does not yield
 * an id below MAXwordID the lookup falls back to
 * get_arc_for_word_without_slot_annotation().
 *
 * @param context        context supplying olabels, arc_token_list and
 *                       beg_silence_word
 * @param atok           arc token to start matching from
 * @param transcription  space separated words
 * @param tlen           number of leading characters of transcription that
 *                       are still to be processed
 * @return PARSE_PASS when the first word has been matched and one of the arcs
 *         following the matched arc carries ilabel == MAXwordID;
 *         PARSE_FAIL otherwise (including over-long input).
 */
static int check_word_path(srec_context* context, arc_token* atok,
                           const char* transcription, int tlen)
{
  if (strlen(transcription) >= MAX_LOCAL_LEN - 1)
  {
    PLogError("Transcription too long [%s]\n", transcription);
    return PARSE_FAIL;
  }

  while (1)
  {
    char copy_of_word[MAX_LOCAL_LEN]; /* NUL-terminated copy of the current word */
    const char* wd;
    size_t len;
    arc_token* next_atok;
    arc_token* tmp;
    wordID wdID;
    int whether_final_atok = 0;

    /* wd points to the first char of the last word within the first tlen chars */
    wd = transcription;
    if (tlen > 0)
    {
      for (wd = transcription + tlen - 1; wd > transcription; wd--)
      {
        if (*wd == ' ')
        {
          wd++;
          break;
        }
      }
    }

    /* copy the word (up to the next space or the end of the string) */
    for (len = 0; wd[len] != ' ' && wd[len] != '\0'; len++)
    {
      if (len >= MAX_LOCAL_LEN - 1)
      {
        PLogError("Word too long in transcription [%s]\n", transcription);
        return PARSE_FAIL;
      }
      copy_of_word[len] = wd[len];
    }
    copy_of_word[len] = '\0';

    wdID = wordmap_find_index(context->olabels, copy_of_word);
    if (wdID < MAXwordID)
    {
      next_atok = get_arc_for_word(atok, wdID, context, context->beg_silence_word);
    }
    else
    {
      /* NOTE(review): wd is not NUL-terminated at the word boundary (it runs
         on into the rest of the transcription); presumably the callee stops
         at the space - confirm against its implementation. */
      next_atok = get_arc_for_word_without_slot_annotation(atok, wd, context,
                  context->beg_silence_word);
    }
    if (!next_atok)
      return PARSE_FAIL;

    /* is there an arc with ilabel == MAXwordID among the arcs that follow? */
    for (tmp = ARC_TOKEN_PTR(context->arc_token_list, next_atok->first_next_arc);
         tmp != NULL;
         tmp = ARC_TOKEN_PTR(context->arc_token_list, tmp->next_token_index))
    {
      if (tmp->ilabel == MAXwordID)
        whether_final_atok = 1;
    }

    /* the first word of the transcription was just processed: we are done */
    if (wd == transcription)
      return whether_final_atok ? PARSE_PASS : PARSE_FAIL;

    /* step tlen back to the space that precedes the word just processed */
    tlen--;
    while (tlen > 0 && transcription[tlen] != ' ')
      tlen--;

    atok = next_atok;
  }
}

/**
 * Checks a transcription against the context, starting at the first entry of
 * context->arc_token_list. Trailing spaces of the transcription are ignored.
 *
 * @return the result of check_word_path(): PARSE_PASS or PARSE_FAIL.
 */
int FST_CheckPath_Simple(srec_context* context, const char* transcription)
{
  arc_token* atok = &context->arc_token_list[0];
  int transcription_len = (int)strlen(transcription);

  /* do not count trailing spaces */
  while (transcription_len > 0 && transcription[transcription_len - 1] == ' ')
    transcription_len--;

  return check_word_path(context, atok, transcription, transcription_len);
}

/*
 * Looks a single word up in context->olabels, first with wordmap_find_index()
 * and then in every slot 1 .. num_slots-1 with wordmap_find_index_in_rule().
 * Returns non-zero unless the final lookup result is MAXwordID.
 */
static int word_is_known(srec_context* context, char* word)
{
  wordID k, wdid = wordmap_find_index(context->olabels, word);

  if (wdid < MAXwordID)
    return 1;
  for (k = 1; k < context->olabels->num_slots; k++)
  {
    wdid = wordmap_find_index_in_rule(context->olabels, word, k);
    if (wdid < MAXwordID)
      break;
  }
  return wdid != MAXwordID;
}

/*
 * Returns non-zero when every space separated word of text passes
 * word_is_known(). Each word is NUL-terminated in place for the lookup and
 * the overwritten character is restored before moving on, so text is
 * unchanged on return. No hidden tokenizer state is used.
 */
static int all_words_known(srec_context* context, char* text)
{
  char* word = text;

  for (;;)
  {
    char* end;
    char saved;
    int known;

    while (*word == ' ')
      word++;
    if (*word == '\0')
      return 1;

    for (end = word; *end != '\0' && *end != ' '; end++)
      ;
    saved = *end;
    *end = '\0';
    known = word_is_known(context, word);
    *end = saved;

    if (!known)
      return 0;
    word = end;
  }
}

/*
 * Copies text into literal when it fits (including the terminating NUL) in
 * max_literal_len bytes. Returns PARSE_PASS on success; logs and returns
 * PARSE_FAIL, leaving literal untouched, when it does not fit.
 */
static int store_literal(const char* text, char* literal, size_t max_literal_len)
{
  if (strlen(text) >= max_literal_len)
  {
    PLogError("FST_CheckPath_Complex() literal buffer too small\n");
    return PARSE_FAIL;
  }
  strcpy(literal, text);
  return PARSE_PASS;
}

/**
 * Checks a transcription against the context, additionally trying every
 * variant in which any subset of the spaces is replaced by '_'.
 *
 * Variants are enumerated in increasing bit-mask order (bit j set means the
 * j-th space is written as '_'). A variant is only passed on to
 * FST_CheckPath_Simple() when each of its words is found in context->olabels,
 * either directly or within one of the slots. The first variant that passes
 * is copied to literal.
 *
 * @param context          context to check against
 * @param transcription    space separated words, shorter than MAX_LOCAL_LEN
 * @param literal          receives the accepted variant on PARSE_PASS
 * @param max_literal_len  size of the literal buffer in bytes
 * @return PARSE_PASS when a variant was accepted and stored in literal;
 *         PARSE_FAIL otherwise, including when the transcription is too long,
 *         contains more spaces than can be tracked, or the accepted variant
 *         does not fit into literal.
 */
int FST_CheckPath_Complex(srec_context* context, const char* transcription,
                          char* literal, size_t max_literal_len)
{
  int i, j, rc;
  int num_spaces;
  char copy_of_transcription[MAX_LOCAL_LEN];
  char* spaces[24], *p; /* can't go too high here!! (2^num_spaces variants) */

  /* Checked at run time rather than by assertion only, so the copy below can
     never overrun the local buffer even if assertions are compiled out. */
  if (strlen(transcription) >= MAX_LOCAL_LEN)
  {
    PLogError("FST_CheckPath_Complex() failed on too long transcription\n");
    return PARSE_FAIL;
  }
  strcpy(copy_of_transcription, transcription);

  /* remember where the spaces are */
  for (num_spaces = 0, p = copy_of_transcription; *p; p++)
  {
    if (*p == ' ')
    {
      if ((size_t)num_spaces >= sizeof(spaces) / sizeof(char*))
      {
        PLogError("FST_CheckPath_Complex() failed on too many words\n");
        return PARSE_FAIL;
      }
      spaces[num_spaces++] = p;
    }
  }

  /* single word: nothing to vary */
  if (num_spaces == 0)
  {
    rc = FST_CheckPath_Simple(context, transcription);
    if (rc == PARSE_PASS)
      rc = store_literal(copy_of_transcription, literal, max_literal_len);
    return rc;
  }

  for (i = 0; i < (1 << num_spaces); i++)
  {
    /* write variant i: bit j selects '_' or ' ' for the j-th space */
    for (j = 0; j < num_spaces; j++)
      *spaces[j] = i & (1 << j) ? '_' : ' ';

    /* check each word, potentially within a rule! */
    if (!all_words_known(context, copy_of_transcription))
      continue;

    rc = FST_CheckPath_Simple(context, copy_of_transcription);
    if (rc == PARSE_PASS)
      return store_literal(copy_of_transcription, literal, max_literal_len);
  }
  return PARSE_FAIL;
}

static void clean_up_sentence(char* s);

/**
 * Entry point: normalizes the transcription with clean_up_sentence() and
 * checks it with FST_CheckPath_Complex().
 *
 * @param context          context to check against
 * @param transcription    raw transcription, shorter than 256 characters
 * @param literal          receives the accepted variant on PARSE_PASS
 * @param max_literal_len  size of the literal buffer in bytes
 * @return 2 when the context has no arc_token_list; PARSE_FAIL when the
 *         transcription is too long for the local buffer; otherwise the
 *         result of FST_CheckPath_Complex().
 */
int FST_CheckPath(srec_context* context, const char* transcription,
                  char* literal, size_t max_literal_len)
{
  char mytranscription[256];

  if (!context->arc_token_list)
    return 2;

  /* Checked at run time rather than by assertion only, so the copy below can
     never overrun the local buffer even if assertions are compiled out. */
  if (strlen(transcription) >= sizeof(mytranscription))
  {
    PLogError("FST_CheckPath() failed on too long transcription\n");
    return PARSE_FAIL;
  }
  strcpy(mytranscription, transcription);
  clean_up_sentence(mytranscription);

  return FST_CheckPath_Complex(context, mytranscription, literal, max_literal_len);
}

/**
 * Normalizes a sentence in place:
 *   - every "[...]" speech code, brackets included, is treated as spaces
 *     (an unterminated '[' blanks everything up to the end of the string,
 *     a stray ']' counts as a single space);
 *   - leading and trailing spaces are removed;
 *   - runs of spaces between words are collapsed to one space.
 *
 * Done in one pass with a read and a write cursor; the write cursor never
 * overtakes the read cursor, and the scan always stops at the terminating
 * NUL, also when a '[' is never closed.
 */
static void clean_up_sentence(char* s)
{
  const char* src;
  char* dst = s;
  int in_code = 0;       /* currently between '[' and the matching ']' */
  int pending_space = 0; /* a separator is owed before the next word char */

  for (src = s; *src; src++)
  {
    char c = *src;

    /* change speech codes to spaces */
    if (in_code)
    {
      if (c == ']')
        in_code = 0;
      c = ' ';
    }
    else if (c == '[')
    {
      in_code = 1;
      c = ' ';
    }
    else if (c == ']')
    {
      c = ' ';
    }

    if (c == ' ')
    {
      /* leading spaces are dropped; others are emitted lazily, so trailing
         spaces never reach the output */
      pending_space = (dst != s);
      continue;
    }

    if (pending_space)
    {
      *dst++ = ' ';
      pending_space = 0;
    }
    *dst++ = c;
  }
  *dst = '\0';
}