diff -crN chasen-2.2.3/lib/Makefile chasen-2.2.4/lib/Makefile *** chasen-2.2.3/lib/Makefile Wed Apr 18 10:34:28 2001 --- chasen-2.2.4/lib/Makefile Wed Apr 18 10:34:51 2001 *************** *** 71,82 **** OBJDUMP = @OBJDUMP@ PACKAGE = chasen RANLIB = ranlib ! VERSION = 2.2.3 include_HEADERS = chasen.h lib_LTLIBRARIES = libchasen.la ! libchasen_la_SOURCES = chadic.h chalib.c chalib.h chfile.c connect.c getid.c getopt.c grammar.c init.c iotool.c jfgets.c katuyou.c lisp.c parse.c pat.c pat.h patfile.c print.c select.c sufary.h zentohan.c mmap.c dic.c libchasen_la_LDFLAGS = -version-info $(LTVERSION) --- 71,82 ---- OBJDUMP = @OBJDUMP@ PACKAGE = chasen RANLIB = ranlib ! VERSION = 2.2.4 include_HEADERS = chasen.h lib_LTLIBRARIES = libchasen.la ! libchasen_la_SOURCES = chadic.h chalib.c chalib.h chfile.c connect.c getid.c getopt.c grammar.c init.c iotool.c jfgets.c katuyou.c lisp.c tokenizer.c parse.c pat.c pat.h patfile.c print.c select.c sufary.h zentohan.c mmap.c dic.c tokenizer.h htobe.c htobe.h libchasen_la_LDFLAGS = -version-info $(LTVERSION) *************** *** 94,101 **** libchasen_la_LIBADD = libchasen_la_OBJECTS = chalib.lo chfile.lo connect.lo getid.lo \ getopt.lo grammar.lo init.lo iotool.lo jfgets.lo katuyou.lo lisp.lo \ ! parse.lo pat.lo patfile.lo print.lo select.lo zentohan.lo mmap.lo \ ! dic.lo CFLAGS = -g -O2 -Wall COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) --- 94,101 ---- libchasen_la_LIBADD = libchasen_la_OBJECTS = chalib.lo chfile.lo connect.lo getid.lo \ getopt.lo grammar.lo init.lo iotool.lo jfgets.lo katuyou.lo lisp.lo \ ! tokenizer.lo parse.lo pat.lo patfile.lo print.lo select.lo zentohan.lo \ ! mmap.lo dic.lo htobe.lo CFLAGS = -g -O2 -Wall COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) diff -crN chasen-2.2.3/lib/Makefile.am chasen-2.2.4/lib/Makefile.am *** chasen-2.2.3/lib/Makefile.am Sat Feb 24 09:36:44 2001 --- chasen-2.2.4/lib/Makefile.am Fri Mar 16 13:06:17 2001 *************** *** 3,11 **** libchasen_la_SOURCES = chadic.h chalib.c chalib.h chfile.c connect.c \ getid.c getopt.c grammar.c init.c iotool.c \ ! jfgets.c katuyou.c lisp.c parse.c \ pat.c pat.h patfile.c print.c select.c sufary.h \ ! zentohan.c mmap.c dic.c libchasen_la_LDFLAGS = -version-info $(LTVERSION) --- 3,11 ---- libchasen_la_SOURCES = chadic.h chalib.c chalib.h chfile.c connect.c \ getid.c getopt.c grammar.c init.c iotool.c \ ! jfgets.c katuyou.c lisp.c tokenizer.c parse.c \ pat.c pat.h patfile.c print.c select.c sufary.h \ ! zentohan.c mmap.c dic.c tokenizer.h htobe.c htobe.h libchasen_la_LDFLAGS = -version-info $(LTVERSION) diff -crN chasen-2.2.3/lib/Makefile.in chasen-2.2.4/lib/Makefile.in *** chasen-2.2.3/lib/Makefile.in Sat Feb 24 09:39:21 2001 --- chasen-2.2.4/lib/Makefile.in Tue Mar 20 05:17:55 2001 *************** *** 76,82 **** include_HEADERS = chasen.h lib_LTLIBRARIES = libchasen.la ! libchasen_la_SOURCES = chadic.h chalib.c chalib.h chfile.c connect.c getid.c getopt.c grammar.c init.c iotool.c jfgets.c katuyou.c lisp.c parse.c pat.c pat.h patfile.c print.c select.c sufary.h zentohan.c mmap.c dic.c libchasen_la_LDFLAGS = -version-info $(LTVERSION) --- 76,82 ---- include_HEADERS = chasen.h lib_LTLIBRARIES = libchasen.la ! libchasen_la_SOURCES = chadic.h chalib.c chalib.h chfile.c connect.c getid.c getopt.c grammar.c init.c iotool.c jfgets.c katuyou.c lisp.c tokenizer.c parse.c pat.c pat.h patfile.c print.c select.c sufary.h zentohan.c mmap.c dic.c tokenizer.h htobe.c htobe.h libchasen_la_LDFLAGS = -version-info $(LTVERSION) *************** *** 94,101 **** libchasen_la_LIBADD = libchasen_la_OBJECTS = chalib.lo chfile.lo connect.lo getid.lo \ getopt.lo grammar.lo init.lo iotool.lo jfgets.lo katuyou.lo lisp.lo \ ! parse.lo pat.lo patfile.lo print.lo select.lo zentohan.lo mmap.lo \ ! dic.lo CFLAGS = @CFLAGS@ COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) --- 94,101 ---- libchasen_la_LIBADD = libchasen_la_OBJECTS = chalib.lo chfile.lo connect.lo getid.lo \ getopt.lo grammar.lo init.lo iotool.lo jfgets.lo katuyou.lo lisp.lo \ ! tokenizer.lo parse.lo pat.lo patfile.lo print.lo select.lo zentohan.lo \ ! mmap.lo dic.lo htobe.lo CFLAGS = @CFLAGS@ COMPILE = $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) LTCOMPILE = $(LIBTOOL) --mode=compile $(CC) $(DEFS) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) diff -crN chasen-2.2.3/lib/chadic.h chasen-2.2.4/lib/chadic.h *** chasen-2.2.3/lib/chadic.h Wed Feb 14 09:20:52 2001 --- chasen-2.2.4/lib/chadic.h Fri Mar 16 15:13:57 2001 *************** *** 1,39 **** /* ! ============================================================================== ! chadic.h ! 1990/12/06/Thu Yutaka MYOKI(Nagao Lab., KUEE) ! 1990/01/09/Wed Last Modified ! >>> 94/02 changed by T.Nakamura <<< ! ============================================================================== ! */ #ifndef __CHADIC_H__ #define __CHADIC_H__ #include #include #include #include #include ! #include ! #if defined _WIN32 && ! defined __CYGWIN__ ! #include ! #else ! #include ! #endif ! #include - #include - #include - - #include "config.h" #include "pat.h" #include "sufary.h" - /* - grep * macros - */ - #ifndef FALSE #define FALSE ((int)0) #endif --- 1,28 ---- /* ! * chadic.h ! * 1990/12/06/Thu Yutaka MYOKI(Nagao Lab., KUEE) ! * ! * $Id: chadic.h,v 1.18 2001/03/16 06:13:57 kazuma-t Exp $ ! */ ! #ifndef __CHADIC_H__ #define __CHADIC_H__ + #include "config.h" + #include + #include #include #include #include + #ifdef HAVE_UNISTD_H #include ! #endif /* HAVE_UNISTD_H */ #include "pat.h" #include "sufary.h" #ifndef FALSE #define FALSE ((int)0) #endif *************** *** 48,60 **** #define PATH_DELIMITER '/' #endif - /* fseek() */ - #ifndef SEEK_SET - #define SEEK_SET 0 - #define SEEK_CUR 1 - #define SEEK_END 2 - #endif - #define CHAINT_OFFSET 11 #define CHAINT_SCALE (256-CHAINT_OFFSET) --- 37,42 ---- *************** *** 75,82 **** #define ESTR_BOS_EOS "BOS/EOS" /* cforms.cha */ ! #define JSTR_BASIC_FORM "基本形" ! #define ESTR_BASIC_FORM "BASIC" /* *.dic */ #define JSTR_DEF_POS_COST "デフォルト品詞コスト" --- 57,65 ---- #define ESTR_BOS_EOS "BOS/EOS" /* cforms.cha */ ! #define JSTR_BASE_FORM "基本形" ! #define ESTR_BASE_FORM1 "BASEFORM" ! #define ESTR_BASE_FORM2 "STEMFORM" /* *.dic */ #define JSTR_DEF_POS_COST "デフォルト品詞コスト" *************** *** 100,107 **** #define JSTR_INFO1 "付加情報" #define JSTR_INFO2 "意味情報" #define ESTR_INFO "INFO" ! #define JSTR_COMPO "複合語" ! #define ESTR_COMPO "COMP" #define JSTR_SEG "構成語" #define ESTR_SEG "SEG" #define JSTR_CONN_ATTR "連接属性" --- 83,90 ---- #define JSTR_INFO1 "付加情報" #define JSTR_INFO2 "意味情報" #define ESTR_INFO "INFO" ! #define JSTR_COMPOUND "複合語" ! #define ESTR_COMPOUND "COMPOUND" #define JSTR_SEG "構成語" #define ESTR_SEG "SEG" #define JSTR_CONN_ATTR "連接属性" *************** *** 130,139 **** #define ESTR_COST_WIDTH "COST_WIDTH" #define JSTR_DEF_CONN_COST "未定義連接コスト" #define ESTR_DEF_CONN_COST "DEF_CONN_COST" ! #define JSTR_COMPO_POS "連結品詞" ! #define ESTR_COMPO_POS "COMPOSIT_POS" ! #define JSTR_OUTPUT_COMPO "複合語出力" ! #define ESTR_OUTPUT_COMPO "OUTPUT_COMPOUND" #define ESTR_PAT_FILE "PATDIC" /* changed by Tatuo 960920 */ #define ESTR_SUF_FILE "SUFDIC" #define JSTR_OUTPUT_FORMAT "出力フォーマット" --- 113,122 ---- #define ESTR_COST_WIDTH "COST_WIDTH" #define JSTR_DEF_CONN_COST "未定義連接コスト" #define ESTR_DEF_CONN_COST "DEF_CONN_COST" ! #define JSTR_COMPOSIT_POS "連結品詞" ! #define ESTR_COMPOSIT_POS "COMPOSIT_POS" ! #define JSTR_OUTPUT_COMPOUND "複合語出力" ! #define ESTR_OUTPUT_COMPOUND "OUTPUT_COMPOUND" #define ESTR_PAT_FILE "PATDIC" /* changed by Tatuo 960920 */ #define ESTR_SUF_FILE "SUFDIC" #define JSTR_OUTPUT_FORMAT "出力フォーマット" *************** *** 190,205 **** #define strmatch2(s,s1,s2) (!strcmp(s,s1)||!strcmp(s,s2)) #define strmatch3(s,s1,s2,s3) (!strcmp(s,s1)||!strcmp(s,s2)||!strcmp(s,s3)) /* rensetu matrix */ typedef struct _connect_rule_t { unsigned short next; unsigned short cost; } connect_rule_t; - /* - * structures - */ - /* 部と 部へのポインタで表現されたセル */ typedef struct _bin_t { void *cha_car; /* address of */ --- 173,188 ---- #define strmatch2(s,s1,s2) (!strcmp(s,s1)||!strcmp(s,s2)) #define strmatch3(s,s1,s2,s3) (!strcmp(s,s1)||!strcmp(s,s2)||!strcmp(s,s3)) + /* + * structures + */ + /* rensetu matrix */ typedef struct _connect_rule_t { unsigned short next; unsigned short cost; } connect_rule_t; /* 部と 部へのポインタで表現されたセル */ typedef struct _bin_t { void *cha_car; /* address of */ *************** *** 216,224 **** } chasen_cell_t; /* this structure is used only in mkchadic */ - /* changed by T.Nakamura and S.Kurohashi - 構造体 mrph_t がすべての情報を持ち, - 構造体 MORPHEME はなくなった */ /* morpheme */ typedef struct _mrph { char midasi[MIDASI_LEN]; /* surface form */ --- 199,204 ---- *************** *** 237,279 **** char is_undef; /* the unseen word or not */ } mrph_t; ! /* POS information */ typedef struct _hinsi_t { ! short *path; ! short *daughter; ! char *name; ! char *bkugiri; ! short comp; /* 連結品詞番号 */ ! char depth; ! char kt; unsigned char cost; } hinsi_t; /* 活用型 conjugation type */ typedef struct _ktype { ! char *name; ! short basic; } ktype_t; /* 活用形 conjugation form */ typedef struct _kform { ! char *name; ! char *gobi; ! int gobi_len; ! char *ygobi; ! char *pgobi; } kform_t; /* 連接表 connection matrix */ typedef struct _rensetu_pair { short index; ! short i_pos; ! short j_pos; ! unsigned short hinsi; ! unsigned char type; ! unsigned char form; ! char *goi; } rensetu_pair_t; /* --- 217,259 ---- char is_undef; /* the unseen word or not */ } mrph_t; ! /* POS information -- see also the comments (the end of this file) */ typedef struct _hinsi_t { ! short *path; /* the path to top node */ ! short *daughter; /* the daughter node */ ! char *name; /* the name of POS (at the level) */ ! char *bkugiri; /* for bunsetsu segmentation */ ! short composit; /* for the COMPOSIT_POS */ ! char depth; /* the depth from top node */ ! char kt; /* have conjugation or not */ unsigned char cost; } hinsi_t; /* 活用型 conjugation type */ typedef struct _ktype { ! char *name; /* CTYPE name */ ! short basic; /* base form */ } ktype_t; /* 活用形 conjugation form */ typedef struct _kform { ! char *name; /* CFORM name */ ! char *gobi; /* suffix of surface form */ ! int gobi_len; /* the length of suffix */ ! char *ygobi; /* suffix of Japanese reading */ ! char *pgobi; /* suffix of Japanese pronunciation */ } kform_t; /* 連接表 connection matrix */ typedef struct _rensetu_pair { short index; ! short i_pos; /* the POS index in the current state (= preceding morpheme) */ ! short j_pos; /* the POS index in the input (= current morpheme) */ ! unsigned short hinsi; /* POS */ ! unsigned char type; /* CTYPE */ ! unsigned char form; /* CFORM */ ! char *goi; /* Lexicalized POS */ } rensetu_pair_t; /* *************** *** 281,287 **** */ #define HINSI_MAX 4096 ! extern hinsi_t Cha_hinsi[HINSI_MAX]; extern ktype_t Cha_type[TYPE_NUM]; extern kform_t Cha_form[TYPE_NUM][FORM_NUM]; extern int Cha_lineno, Cha_lineno_error; --- 261,267 ---- */ #define HINSI_MAX 4096 ! extern hinsi_t Cha_hinsi[HINSI_MAX]; /* see also the comments (the end of this file) */ extern ktype_t Cha_type[TYPE_NUM]; extern kform_t Cha_form[TYPE_NUM][FORM_NUM]; extern int Cha_lineno, Cha_lineno_error; *************** *** 380,385 **** --- 360,398 ---- /* mmap.c */ off_t cha_mmap_file(char*, void**); + off_t cha_mmap_file_w(char*, void**); void cha_munmap_file(void*, off_t); #endif /* __CHADIC_H__ */ + + + /* + the data format of the structure hinsi_t + the POS informations are treated in global valuable Cha_hinsi[n] + + ============= =================== + "grammar.cha" "real POS tag list" + ============= =================== + (A1 ; Cha_hinsi[1] + (B1) ; Cha_hinsi[2] A1-B1 ; Cha_hinsi[2] + (B2 ; Cha_hinsi[3] + (C1) ; Cha_hinsi[4] A1-B2-C1 ; Cha_hinsi[4] + (C2 ; Cha_hinsi[5] + (D1) ; Cha_hinsi[6] A1-B2-C2-D1 ; Cha_hinsi[6] + (D2) ; Cha_hinsi[7] A1-B2-C2-D2 ; Cha_hinsi[7] + (D3)) ; Cha_hinsi[8] A1-B2-C2-D3 ; Cha_hinsi[8] + (C3) ; Cha_hinsi[9] A1-B2-C3 ; Cha_hinsi[9] + (C4 ; Cha_hinsi[10] + (D4) ; Cha_hinsi[11] A1-B2-C4-D4 ; Cha_hinsi[11] + (D5)))) ; Cha_hinsi[12] A1-B2-C4-D5 ; Cha_hinsi[12] + + ========================================= + *hinsi_t Cha_hinsi[HINSI] for the example + ========================================= + n (idx) = 1 2 3 4 5 6 7 8 9 10 11 12 + Cha_hinsi[n].name = A1 B1 B2 C1 C2 D1 D2 D3 C3 C4 D4 D5 + Cha_hinsi[n].depth = 1 2 2 3 3 4 4 4 3 3 4 4 + *Cha_hinsi[n].daughter = 2 0 4 0 6 0 0 0 0 11 0 0 + *Cha_hinsi[n].path = 1 1 1 1 1 1 1 1 1 1 1 1 + + */ diff -crN chasen-2.2.3/lib/chalib.c chasen-2.2.4/lib/chalib.c *** chasen-2.2.3/lib/chalib.c Fri Feb 23 20:50:10 2001 --- chasen-2.2.4/lib/chalib.c Fri Mar 16 06:25:48 2001 *************** *** 33,61 **** * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * modified by A.Kitauchi , Sep. 1996 ! * $Id: chalib.c,v 1.14 2001/02/23 11:50:10 kazuma-t Exp $ */ #include "chalib.h" #include "pat.h" #include "sufary.h" - #ifdef KOCHA - #define CHA_NAME "KoCha" - #else #define CHA_NAME "ChaSen" - #endif int Cha_cost_width = -1; ! int Cha_lang_j = 0, Cha_lang_e = 0; ! int Cha_encode = CHA_ENCODE_EUC; static int cost_width0; static char patdic_filename[PAT_DIC_NUM][CHA_FILENAME_LEN]; static char sufdic_filename[PAT_DIC_NUM][CHA_FILENAME_LEN]; ! static int obj_dic_no = 0; /* 動的処理(追加)の対象となる辞書の番号 */ static int opt_show = 'b', opt_form = 'f', opt_ja, opt_cmd, opt_nobk; static char *opt_form_string; --- 33,59 ---- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * modified by A.Kitauchi , Sep. 1996 ! * $Id: chalib.c,v 1.18 2001/03/15 21:25:48 masayu-a Exp $ */ #include "chalib.h" #include "pat.h" #include "sufary.h" + #include "tokenizer.h" #define CHA_NAME "ChaSen" int Cha_cost_width = -1; ! enum cha_lang Cha_lang = CHASEN_LANG_JA; ! enum cha_encode Cha_encode = CHASEN_ENCODE_EUC; ! chasen_tok_t *Cha_tokenizer; static int cost_width0; static char patdic_filename[PAT_DIC_NUM][CHA_FILENAME_LEN]; static char sufdic_filename[PAT_DIC_NUM][CHA_FILENAME_LEN]; ! static int obj_dic_no = 0; /* 動的処理(追加)の対象となる辞書の番号 */ static int opt_show = 'b', opt_form = 'f', opt_ja, opt_cmd, opt_nobk; static char *opt_form_string; *************** *** 63,74 **** /* * cha_version() */ ! void cha_version(FILE *fp) { if (!fp) ! return; ! fprintf(fp, "%s version %s (c) 1996-2001 Nara Institute of Science and Technology\n", CHA_NAME, VERSION); fprintf(fp, "Grammar files are in ChaSen's new v-gram format.\n"); } --- 61,74 ---- /* * cha_version() */ ! void ! cha_version(FILE * fp) { if (!fp) ! return; ! fprintf(fp, ! "%s version %s (c) 1996-2001 Nara Institute of Science and Technology\n", CHA_NAME, VERSION); fprintf(fp, "Grammar files are in ChaSen's new v-gram format.\n"); } *************** *** 76,155 **** /* * cha_set_opt_form() */ ! void cha_set_opt_form(char *format) { char *f; ! #if 0 ! if (opt_form_string != NULL) ! free(opt_form_string); ! #endif ! ! /* -[fecdv] */ if (format && ! format[0] == '-' && strchr("fecdv", format[1]) && format[2] == '\0') { opt_form = format[1]; format = NULL; } if (format == NULL) { if (opt_form == 'd' || opt_form == 'v') ! opt_show = 'm'; ! switch (opt_form) { ! case 'd': ! opt_form_string = "morph(%pi,%ps,%pe,%pc,'%m','%U(%y)','%M',%U(%P'),NIL,%T0,%F0,'%I0',%c,[%ppc,],[%ppi,])"; ! break; ! case 'v': ! opt_form_string = "%pb%3pi %3ps %3pe %5pc %m\t%U(%y)\t%U(%a)\t%M\t%U(%P-) NIL %T0 %F0 %I0 %c %ppi, %ppc,\n"; break; ! case 'f': opt_form_string = "%m\t%y\t%M\t%U(%P-)\t%T \t%F \n"; break; ! case 'e': opt_form_string = "%m\t%U(%y)\t%M\t%P- %h %T* %t %F* %f\n"; break; ! case 'c': opt_form_string = "%m\t%y\t%M\t%h %t %f\n"; break; } return; } ! /* format string */ opt_form_string = format; ! /* opt_form_string = cha_convert_escape(cha_strdup(format), 1); */ f = opt_form_string + strlen(opt_form_string); if (f[-1] == '\n') ! opt_form = 'F'; else ! opt_form = 'W'; } /* * cha_set_language() */ ! void cha_set_language(char *langstr) { ! char *s; ! Cha_lang_j = Cha_lang_e = 0; ! for (s = langstr; *s; s++) { ! if (*s == 'j') ! Cha_lang_j = 1; ! else if (*s == 'e') ! Cha_lang_e = 1; } } /* * cha_set_cost_width() */ ! void cha_set_cost_width(int cw) { cost_width0 = cw * MRPH_DEFAULT_WEIGHT; ! /* 最適解以外も表示するときは Cha_cost_width を生かす */ Cha_cost_width = opt_show == 'b' ? -1 : cost_width0; } --- 76,162 ---- /* * cha_set_opt_form() */ ! void ! cha_set_opt_form(char *format) { char *f; ! /* ! * -[fecdv] ! */ if (format && ! format[0] == '-' && strchr("fecdv", format[1]) ! && format[2] == '\0') { opt_form = format[1]; format = NULL; } if (format == NULL) { if (opt_form == 'd' || opt_form == 'v') ! opt_show = 'm'; ! switch (opt_form) { ! case 'd': ! opt_form_string = ! "morph(%pi,%ps,%pe,%pc,'%m','%U(%y)','%M',%U(%P'),NIL,%T0,%F0,'%I0',%c,[%ppc,],[%ppi,])"; ! break; ! case 'v': ! opt_form_string = ! "%pb%3pi %3ps %3pe %5pc %m\t%U(%y)\t%U(%a)\t%M\t%U(%P-) NIL %T0 %F0 %I0 %c %ppi, %ppc,\n"; break; ! case 'f': opt_form_string = "%m\t%y\t%M\t%U(%P-)\t%T \t%F \n"; break; ! case 'e': opt_form_string = "%m\t%U(%y)\t%M\t%P- %h %T* %t %F* %f\n"; break; ! case 'c': opt_form_string = "%m\t%y\t%M\t%h %t %f\n"; break; } return; } ! /* ! * format string ! */ opt_form_string = format; ! /* ! * opt_form_string = cha_convert_escape(cha_strdup(format), 1); ! */ f = opt_form_string + strlen(opt_form_string); if (f[-1] == '\n') ! opt_form = 'F'; else ! opt_form = 'W'; } /* * cha_set_language() */ ! void ! cha_set_language(char *langstr) { ! Cha_lang = CHASEN_LANG_JA; ! if (langstr[0] == 'j') { ! Cha_lang = CHASEN_LANG_JA; ! } else if (langstr[0] == 'e') { ! Cha_lang = CHASEN_LANG_EN; } } /* * cha_set_cost_width() */ ! void ! cha_set_cost_width(int cw) { cost_width0 = cw * MRPH_DEFAULT_WEIGHT; ! /* ! * 最適解以外も表示するときは Cha_cost_width を生かす ! */ Cha_cost_width = opt_show == 'b' ? -1 : cost_width0; } *************** *** 160,251 **** * 0 - ok * 1 - error */ ! int chasen_getopt_argv(char **argv, FILE *fp) { int c; ! /* read -r option */ Cha_optind = 0; while ((c = cha_getopt_chasen(argv, fp)) != EOF) { switch (c) { ! case 'r': ! /* chasenrc file */ cha_set_rcpath(Cha_optarg); break; ! case '?': return 1; } } ! /* initialize if not done */ if (!Cha_undef_info_num) ! cha_init(); ! /* read options */ Cha_optind = 0; while ((c = cha_getopt_chasen(argv, fp)) != EOF) { switch (c) { ! case 'b': ! case 'm': ! case 'p': opt_show = c; break; ! case 'd': ! case 'v': ! case 'f': ! case 'e': ! case 'c': opt_form = c; cha_set_opt_form(NULL); break; ! case 'F': ! cha_set_opt_form(cha_convert_escape(cha_strdup(Cha_optarg), 0)); break; ! case 'L': cha_set_language(Cha_optarg); break; ! case 'w': /* コスト幅の指定 */ cha_set_cost_width(atoi(Cha_optarg)); break; ! case 'O': ! Cha_output_compo = *Cha_optarg == 'c'; break; ! case 'l': cha_set_output(stdout); switch (*Cha_optarg) { ! case 'p': ! /* display the list of Cha_hinsi table */ cha_print_hinsi_table(); exit(0); break; ! case 't': cha_print_ctype_table(); exit(0); break; ! case 'f': cha_print_cform_table(); exit(0); break; ! default: break; } break; ! case 'j': opt_ja = 1; break; ! case 'B': opt_nobk = 1; break; ! case 'C': opt_cmd = 1; break; ! #if 0 /* not necessary */ ! case '?': return 1; #endif } } ! /* 最適解以外も表示するときは Cha_cost_width を生かす */ Cha_cost_width = opt_show == 'b' ? -1 : cost_width0; return 0; } ! /*********************************************************************** * command_usage() ! ***********************************************************************/ ! static void command_usage(void) { static char *message[] = { "commands are:\n", --- 167,283 ---- * 0 - ok * 1 - error */ ! int ! chasen_getopt_argv(char **argv, FILE * fp) { int c; ! /* ! * read -r option ! */ Cha_optind = 0; while ((c = cha_getopt_chasen(argv, fp)) != EOF) { switch (c) { ! case 'r': ! /* ! * chasenrc file ! */ cha_set_rcpath(Cha_optarg); break; ! case '?': ! return 1; } } ! /* ! * initialize if not done ! */ if (!Cha_undef_info_num) ! cha_init(); ! /* ! * read options ! */ Cha_optind = 0; while ((c = cha_getopt_chasen(argv, fp)) != EOF) { switch (c) { ! case 'b': ! case 'm': ! case 'p': ! opt_show = c; ! break; ! case 'd': ! case 'v': ! case 'f': ! case 'e': ! case 'c': opt_form = c; cha_set_opt_form(NULL); break; ! case 'F': ! cha_set_opt_form(cha_convert_escape ! (cha_strdup(Cha_optarg), 0)); break; ! case 'L': cha_set_language(Cha_optarg); break; ! case 'w': /* コスト幅の指定 */ cha_set_cost_width(atoi(Cha_optarg)); break; ! case 'O': ! Cha_output_iscompound = *Cha_optarg == 'c'; break; ! case 'l': cha_set_output(stdout); switch (*Cha_optarg) { ! case 'p': ! /* ! * display the list of Cha_hinsi table ! */ cha_print_hinsi_table(); exit(0); break; ! case 't': cha_print_ctype_table(); exit(0); break; ! case 'f': cha_print_cform_table(); exit(0); break; ! default: break; } break; ! case 'j': ! opt_ja = 1; ! break; ! case 'B': ! opt_nobk = 1; ! break; ! case 'C': ! opt_cmd = 1; ! break; ! #if 0 /* not necessary */ ! case '?': ! return 1; #endif } } ! /* ! * 最適解以外も表示するときは Cha_cost_width を生かす ! */ Cha_cost_width = opt_show == 'b' ? -1 : cost_width0; return 0; } ! /* * command_usage() ! */ ! static void ! command_usage(void) { static char *message[] = { "commands are:\n", *************** *** 264,375 **** char **mes; for (mes = message; *mes; mes++) ! fputs(*mes, stdout); } - #if 1 /* * chomp a string */ ! static void chomp(char *str) { int len; len = strlen(str); if (str[len - 1] == '\n') ! str[--len] = '\0'; if (str[len - 1] == '\r') ! str[--len] = '\0'; } - #endif ! /*********************************************************************** * chasen_command() * * return value: * 0 - succeed * 1 - quit chasen ! ***********************************************************************/ ! static int chasen_command(char *comm) { char *arg; int i; char *rslt[256]; /* 辞書引き結果変数 for 単語チェック(exact match) */ ! FILE *of; /* intファイルに書き込む(追加)ためのもの */ long new_word_index = 0; char tmpstr[2000]; arg = comm + 2; chomp(arg); ! switch(comm[0]) { /* command */ ! case 'V': cha_version(stdout); break; ! case 'F': cha_set_opt_form(cha_convert_escape(cha_strdup(arg), 0)); break; ! case 'L': cha_set_language(Cha_optarg); break; ! case 'w': ! /* cost width */ cha_set_cost_width(atoi(arg)); break; ! case 'i': ! /* information */ cha_version(stdout); ! printf("\ncost width: %d\n",Cha_cost_width); ! printf("weight of conn. cost: %d\n",Cha_con_cost_weight); ! printf("weight of morph cost: %d\n",Cha_mrph_cost_weight); printf("output format: \"%s\"\n", opt_form_string ? opt_form_string : "(none)"); printf("chasenrc file: %s\n", cha_get_rcpath()); ! printf("grammar file: %s\n",cha_get_grammar_dir()); printf("dic file:\n"); ! for(i = 0; patdic_filename[i][0]; i++) ! printf("\t%s\n",patdic_filename[i]); printf("dic file for processing:\n\t%s\n", patdic_filename[obj_dic_no]); break; ! case 'f': ! /* 処理対象となる辞書の変更 file name -> dic No. */ ! for(i = 0; patdic_filename[i][0]; i++){ ! printf("\t%s\n",patdic_filename[i]); ! if(strcmp(patdic_filename[i], arg) == 0){ ! obj_dic_no = i; /* 動的処理(追加)の対象となる辞書の番号 */ ! printf("dic number = %d\n",obj_dic_no); ! /* 書き込み禁止ならばエラーにしたい */ break; } } break; ! case 'a': ! /* パト木への単語の追加・挿入*/ ! if(strlen(arg) < 4){printf("invalid format\n");break;} ! /* 追加する単語をintファイルに追加 */ ! sprintf(tmpstr,"%s.int",patdic_filename[obj_dic_no]); ! of = cha_fopen(tmpstr,"a",1); fputs(arg, of); fputc(0, of); printf("add [%s] at %ld\n", arg, new_word_index); fclose(of); ! /* マップをやり直してもらうための処理 */ pat_text_reopen(Pat_dicfile[obj_dic_no], tmpstr); pat_insert(Pat_dicfile[obj_dic_no], arg, new_word_index); break; ! case 's': ! /* 木のセーブ */ ! sprintf(tmpstr,"%s.pat",patdic_filename[obj_dic_no]); pat_save(Pat_dicfile[obj_dic_no], tmpstr); break; ! case 'e': ! /* キーの検索 (exact match) */ ! for(i = 0; patdic_filename[i][0]; i++){ mrph2_t mrph; ! printf("DIC No. %d \"%s\"\n",i,patdic_filename[i]); pat_search_exact(Pat_dicfile[i], arg, rslt); ! if(!rslt[0]) printf("Not Found.\n"); else { char **pbuf; for (pbuf = rslt; *pbuf; pbuf++) { --- 296,429 ---- char **mes; for (mes = message; *mes; mes++) ! fputs(*mes, stdout); } /* * chomp a string */ ! static void ! chomp(char *str) { int len; len = strlen(str); if (str[len - 1] == '\n') ! str[--len] = '\0'; if (str[len - 1] == '\r') ! str[--len] = '\0'; } ! /* * chasen_command() * * return value: * 0 - succeed * 1 - quit chasen ! */ ! static int ! chasen_command(char *comm) { char *arg; int i; char *rslt[256]; /* 辞書引き結果変数 for 単語チェック(exact match) */ ! FILE *of; /* intファイルに書き込む(追加)ためのもの */ long new_word_index = 0; char tmpstr[2000]; arg = comm + 2; chomp(arg); ! switch (comm[0]) { /* command */ ! case 'V': cha_version(stdout); break; ! case 'F': cha_set_opt_form(cha_convert_escape(cha_strdup(arg), 0)); break; ! case 'L': cha_set_language(Cha_optarg); break; ! case 'w': ! /* ! * cost width ! */ cha_set_cost_width(atoi(arg)); break; ! case 'i': ! /* ! * information ! */ cha_version(stdout); ! printf("\ncost width: %d\n", Cha_cost_width); ! printf("weight of conn. cost: %d\n", Cha_con_cost_weight); ! printf("weight of morph cost: %d\n", Cha_mrph_cost_weight); printf("output format: \"%s\"\n", opt_form_string ? opt_form_string : "(none)"); printf("chasenrc file: %s\n", cha_get_rcpath()); ! printf("grammar file: %s\n", cha_get_grammar_dir()); printf("dic file:\n"); ! for (i = 0; patdic_filename[i][0]; i++) ! printf("\t%s\n", patdic_filename[i]); printf("dic file for processing:\n\t%s\n", patdic_filename[obj_dic_no]); break; ! case 'f': ! /* ! * 処理対象となる辞書の変更 file name -> dic No. ! */ ! for (i = 0; patdic_filename[i][0]; i++) { ! printf("\t%s\n", patdic_filename[i]); ! if (strcmp(patdic_filename[i], arg) == 0) { ! obj_dic_no = i; /* 動的処理(追加)の対象となる辞書の番号 */ ! printf("dic number = %d\n", obj_dic_no); ! /* ! * 書き込み禁止ならばエラーにしたい ! */ break; } } break; ! case 'a': ! /* ! * パト木への単語の追加・挿入 ! */ ! if (strlen(arg) < 4) { ! printf("invalid format\n"); ! break; ! } ! /* ! * 追加する単語をintファイルに追加 ! */ ! sprintf(tmpstr, "%s.int", patdic_filename[obj_dic_no]); ! of = cha_fopen(tmpstr, "a", 1); fputs(arg, of); fputc(0, of); printf("add [%s] at %ld\n", arg, new_word_index); fclose(of); ! /* ! * マップをやり直してもらうための処理 ! */ pat_text_reopen(Pat_dicfile[obj_dic_no], tmpstr); pat_insert(Pat_dicfile[obj_dic_no], arg, new_word_index); break; ! case 's': ! /* ! * 木のセーブ ! */ ! sprintf(tmpstr, "%s.pat", patdic_filename[obj_dic_no]); pat_save(Pat_dicfile[obj_dic_no], tmpstr); break; ! case 'e': ! /* ! * キーの検索 (exact match) ! */ ! for (i = 0; patdic_filename[i][0]; i++) { mrph2_t mrph; ! printf("DIC No. %d \"%s\"\n", i, patdic_filename[i]); pat_search_exact(Pat_dicfile[i], arg, rslt); ! if (!rslt[0]) ! printf("Not Found.\n"); else { char **pbuf; for (pbuf = rslt; *pbuf; pbuf++) { *************** *** 383,395 **** } } break; ! case 'q': /* quit */ return 1; ! case 'h': command_usage(); break; ! default: ! printf("invalid command: %s\n",comm); } fputs("ok\n", stdout); --- 437,449 ---- } } break; ! case 'q': /* quit */ return 1; ! case 'h': command_usage(); break; ! default: ! printf("invalid command: %s\n", comm); } fputs("ok\n", stdout); *************** *** 405,424 **** * 0 - ok / no result / too many morphs * 1 - quit */ ! static int chasen_sparse_main(char *input, FILE *output) { char *crlf; ! /* initialize if not done */ if (!Cha_undef_info_num) ! cha_init(); if (!opt_form_string) ! cha_set_opt_form(NULL); ! ! #if 0 ! /* 行末の改行コードを取り除く */ ! chomp(input); ! #endif cha_set_output(output); --- 459,476 ---- * 0 - ok / no result / too many morphs * 1 - quit */ ! static int ! chasen_sparse_main(char *input, FILE * output) { char *crlf; ! /* ! * initialize if not done ! */ if (!Cha_undef_info_num) ! cha_init(); if (!opt_form_string) ! cha_set_opt_form(NULL); cha_set_output(output); *************** *** 427,444 **** return 0; } ! /* コマンド・インタプリタ */ if (opt_cmd && *input == '#') ! return chasen_command(input + 1); ! ! /* conversion of ISO-2022-JP string to EUC-JP */ ! /* jis_to_euc(input);*/ ! /* parse a sentence and print */ while (*input) { int c = 0, len; if ((crlf = strpbrk(input, "\r\n")) == NULL) ! len = strlen(input); else { len = crlf - input; c = *crlf; --- 479,510 ---- return 0; } ! /* ! * initialize the tokenizer ! */ ! Cha_tokenizer = cha_tok_new(Cha_lang, Cha_encode); ! cha_tok_set_annotation(Cha_tokenizer, Cha_anno_info); ! ! /* ! * コマンド・インタプリタ ! */ if (opt_cmd && *input == '#') ! return chasen_command(input + 1); ! /* ! * conversion of ISO-2022-JP string to EUC-JP ! */ ! /* ! * jis_to_euc(input); ! */ ! ! /* ! * parse a sentence and print ! */ while (*input) { int c = 0, len; if ((crlf = strpbrk(input, "\r\n")) == NULL) ! len = strlen(input); else { len = crlf - input; c = *crlf; *************** *** 447,465 **** #ifdef SJIS sjis2euc(input); #endif if (len > 0 && !cha_parse_sentence(input, len, opt_nobk)) { cha_print_path(opt_show, opt_form, opt_form_string); } else if (!opt_ja) ! cha_print_bos_eos(opt_form); if (crlf == NULL) ! break; if (c == '\r' && crlf[1] == '\n') ! input = crlf + 2; else ! input = crlf + 1; } return 0; } --- 513,534 ---- #ifdef SJIS sjis2euc(input); #endif + cha_print_reset(); if (len > 0 && !cha_parse_sentence(input, len, opt_nobk)) { cha_print_path(opt_show, opt_form, opt_form_string); } else if (!opt_ja) ! cha_print_bos_eos(opt_form); if (crlf == NULL) ! break; if (c == '\r' && crlf[1] == '\n') ! input = crlf + 2; else ! input = crlf + 1; } + cha_tok_delete(Cha_tokenizer); + return 0; } *************** *** 473,491 **** /* * file -> file */ ! int chasen_fparse(FILE *fp_in, FILE *fp_out) { char line[CHA_INPUT_SIZE]; if (cha_fgets(line, sizeof(line), fp_in) == NULL) ! return 1; return chasen_sparse_main(line, fp_out); } /* * string -> file */ ! int chasen_sparse(char *str_in, FILE *fp_out) { int rc; char *euc_str; --- 542,562 ---- /* * file -> file */ ! int ! chasen_fparse(FILE * fp_in, FILE * fp_out) { char line[CHA_INPUT_SIZE]; if (cha_fgets(line, sizeof(line), fp_in) == NULL) ! return 1; return chasen_sparse_main(line, fp_out); } /* * string -> file */ ! int ! chasen_sparse(char *str_in, FILE * fp_out) { int rc; char *euc_str; *************** *** 509,523 **** /* * file -> string */ ! char *chasen_fparse_tostr(FILE *fp_in) { char line[CHA_INPUT_SIZE]; if (cha_fgets(line, sizeof(line), fp_in) == NULL) ! return NULL; if (chasen_sparse_main(line, NULL)) ! return NULL; return cha_get_output(); } --- 580,595 ---- /* * file -> string */ ! char * ! chasen_fparse_tostr(FILE * fp_in) { char line[CHA_INPUT_SIZE]; if (cha_fgets(line, sizeof(line), fp_in) == NULL) ! return NULL; if (chasen_sparse_main(line, NULL)) ! return NULL; return cha_get_output(); } *************** *** 525,531 **** /* * string -> string */ ! char *chasen_sparse_tostr(char *str_in) { char *euc_str; --- 597,604 ---- /* * string -> string */ ! char * ! chasen_sparse_tostr(char *str_in) { char *euc_str; *************** *** 533,585 **** cha_jistoeuc(str_in, euc_str); if (chasen_sparse_main(euc_str, NULL)) ! return NULL; free(euc_str); return cha_get_output(); } ! char *cha_fgets(char *s, int n, FILE *fp) { if (opt_ja) ! return cha_jfgets(s, n, fp); else ! return cha_fget_line(s, n, fp); } ! static void set_dic_filename(char *filename, char *s) { #ifdef PATHTYPE_MSDOS if (*s == PATH_DELIMITER || *s && s[1] == ':') ! strcpy(filename, s); #else if (*s == PATH_DELIMITER) ! strcpy(filename, s); ! #endif else ! sprintf(filename, "%s%s", cha_get_grammar_dir(), s); } /* * cha_read_patdic - read patricia dictionaries */ ! void cha_read_patdic(chasen_cell_t *cell) { ! int num; char patname[CHA_FILENAME_LEN]; char textname[CHA_FILENAME_LEN]; ! /* return if already read */ if (patdic_filename[0][0]) ! return; for (num = 0; !nullp(cell); num++, cell = cha_cdr(cell)) { if (num >= PAT_DIC_NUM) ! cha_exit_file(1, "too many patricia dictionary files"); set_dic_filename(patdic_filename[num], cha_s_atom(cha_car(cell))); ! /* open patdic */ sprintf(textname, "%s.int", patdic_filename[num]); sprintf(patname, "%s.pat", patdic_filename[num]); Pat_dicfile[num] = pat_open(textname, patname); --- 606,665 ---- cha_jistoeuc(str_in, euc_str); if (chasen_sparse_main(euc_str, NULL)) ! return NULL; free(euc_str); return cha_get_output(); } ! char * ! cha_fgets(char *s, int n, FILE * fp) { if (opt_ja) ! return cha_jfgets(s, n, fp); else ! return cha_fget_line(s, n, fp); } ! static void ! set_dic_filename(char *filename, char *s) { #ifdef PATHTYPE_MSDOS if (*s == PATH_DELIMITER || *s && s[1] == ':') ! strcpy(filename, s); #else if (*s == PATH_DELIMITER) ! strcpy(filename, s); ! #endif /* PATHTYPE_MSDOS */ else ! sprintf(filename, "%s%s", cha_get_grammar_dir(), s); } /* * cha_read_patdic - read patricia dictionaries */ ! void ! cha_read_patdic(chasen_cell_t * cell) { ! int num; char patname[CHA_FILENAME_LEN]; char textname[CHA_FILENAME_LEN]; ! /* ! * return if already read ! */ if (patdic_filename[0][0]) ! return; for (num = 0; !nullp(cell); num++, cell = cha_cdr(cell)) { if (num >= PAT_DIC_NUM) ! cha_exit_file(1, "too many patricia dictionary files"); set_dic_filename(patdic_filename[num], cha_s_atom(cha_car(cell))); ! /* ! * open patdic ! */ sprintf(textname, "%s.int", patdic_filename[num]); sprintf(patname, "%s.pat", patdic_filename[num]); Pat_dicfile[num] = pat_open(textname, patname); *************** *** 590,611 **** /* * cha_read_sufdic - read SUFARY dictionaries */ ! void cha_read_sufdic(chasen_cell_t *cell) { ! int num; char filename[CHA_FILENAME_LEN]; char ary_filename[CHA_FILENAME_LEN]; ! /* return if already read */ if (sufdic_filename[0][0]) ! return; for (num = 0; !nullp(cell); num++, cell = cha_cdr(cell)) { if (num >= PAT_DIC_NUM) ! cha_exit_file(1, "too many SUFARY dictionary files"); set_dic_filename(sufdic_filename[num], cha_s_atom(cha_car(cell))); ! /* open sufdic */ sprintf(filename, "%s.int", sufdic_filename[num]); sprintf(ary_filename, "%s.ary", sufdic_filename[num]); Suf_dicfile[num] = sa_openfiles(filename, ary_filename); --- 670,696 ---- /* * cha_read_sufdic - read SUFARY dictionaries */ ! void ! cha_read_sufdic(chasen_cell_t * cell) { ! int num; char filename[CHA_FILENAME_LEN]; char ary_filename[CHA_FILENAME_LEN]; ! /* ! * return if already read ! */ if (sufdic_filename[0][0]) ! return; for (num = 0; !nullp(cell); num++, cell = cha_cdr(cell)) { if (num >= PAT_DIC_NUM) ! cha_exit_file(1, "too many SUFARY dictionary files"); set_dic_filename(sufdic_filename[num], cha_s_atom(cha_car(cell))); ! /* ! * open sufdic ! */ sprintf(filename, "%s.int", sufdic_filename[num]); sprintf(ary_filename, "%s.ary", sufdic_filename[num]); Suf_dicfile[num] = sa_openfiles(filename, ary_filename); diff -crN chasen-2.2.3/lib/chalib.h chasen-2.2.4/lib/chalib.h *** chasen-2.2.3/lib/chalib.h Fri Feb 23 20:50:10 2001 --- chasen-2.2.4/lib/chalib.h Fri Mar 16 06:25:48 2001 *************** *** 1,3 **** --- 1,7 ---- + /* + * $Id: chalib.h,v 1.16 2001/03/15 21:25:48 masayu-a Exp $ + */ + #ifndef __CHALIB_H__ #define __CHALIB_H__ *************** *** 24,30 **** char *info; /* semantic information */ char *base; /* base form */ char *pron; /* Japanese pronunciation */ ! char *comp; /* compound words information */ short base_length; /* the length of stem */ unsigned short hinsi; /* POS number */ --- 28,34 ---- char *info; /* semantic information */ char *base; /* base form */ char *pron; /* Japanese pronunciation */ ! char *compound; /* compound words information */ short base_length; /* the length of stem */ unsigned short hinsi; /* POS number */ *************** *** 74,89 **** extern anno_info Cha_anno_info[UNDEF_HINSI_MAX]; extern undef_info Cha_undef_info[UNDEF_HINSI_MAX]; extern int Cha_undef_info_num; - extern int Cha_lang_j, Cha_lang_e; extern char *Cha_bos_string; extern char *Cha_eos_string; ! extern int Cha_output_compo; ! ! /* for encoding scheme */ ! #define CHA_ENCODE_EUC 1 ! #define CHA_ENCODE_ISO8859 2 ! #define CHA_ENCODE_UTF8 3 ! extern int Cha_encode; /* * functions --- 78,86 ---- extern anno_info Cha_anno_info[UNDEF_HINSI_MAX]; extern undef_info Cha_undef_info[UNDEF_HINSI_MAX]; extern int Cha_undef_info_num; extern char *Cha_bos_string; extern char *Cha_eos_string; ! extern int Cha_output_iscompound; /* * functions *************** *** 97,103 **** char *cha_get_output(void); void cha_set_fput(int); void cha_set_output(FILE*); ! void cha_set_sentence(char*, short*, char*); void cha_printf_mrph(int, mrph2_t*, char*); void cha_print_path(int, int, char*); void cha_print_bos_eos(int); --- 94,100 ---- char *cha_get_output(void); void cha_set_fput(int); void cha_set_output(FILE*); ! void cha_print_reset(void); void cha_printf_mrph(int, mrph2_t*, char*); void cha_print_path(int, int, char*); void cha_print_bos_eos(int); diff -crN chasen-2.2.3/lib/chasen.h chasen-2.2.4/lib/chasen.h *** chasen-2.2.3/lib/chasen.h Fri Feb 23 20:50:10 2001 --- chasen-2.2.4/lib/chasen.h Sat Feb 24 15:17:22 2001 *************** *** 1,9 **** /* * chasen.h - header file for ChaSen library */ ! #ifndef _CHASEN_H ! #define _CHASEN_H /* variables */ extern int Cha_optind; --- 1,11 ---- /* * chasen.h - header file for ChaSen library + * + * $Id: chasen.h,v 1.3 2001/02/24 06:17:22 kazuma-t Exp $ */ ! #ifndef __CHASEN_H__ ! #define __CHASEN_H__ /* variables */ extern int Cha_optind; *************** *** 15,18 **** extern char *chasen_fparse_tostr(FILE*); extern char *chasen_sparse_tostr(char*); ! #endif --- 17,20 ---- extern char *chasen_fparse_tostr(FILE*); extern char *chasen_sparse_tostr(char*); ! #endif /* __CHASEN_H__ */ diff -crN chasen-2.2.3/lib/chfile.c chasen-2.2.4/lib/chfile.c *** chasen-2.2.3/lib/chfile.c Fri Feb 23 20:50:10 2001 --- chasen-2.2.4/lib/chfile.c Sat Feb 24 15:17:22 2001 *************** *** 1,5 **** ! /*--------------------------------------------------------------* ! * * chfile.c - ファイルの開閉処理 * open/close suffix array files * SUFARY --- Suffix Array 検索のためのライブラリ --- 1,4 ---- ! /* * chfile.c - ファイルの開閉処理 * open/close suffix array files * SUFARY --- Suffix Array 検索のためのライブラリ *************** *** 37,45 **** * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ! * $Id: chfile.c,v 1.9 2001/02/23 11:50:10 kazuma-t Exp $ * ! *--------------------------------------------------------------*/ #include #include --- 36,44 ---- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ! * $Id: chfile.c,v 1.10 2001/02/24 06:17:22 kazuma-t Exp $ * ! */ #include #include *************** *** 47,69 **** #include "chadic.h" #include "sufary.h" ! /* %%%%% 関数同士の関係 %%%%% ! sa_openfiles() !  +- sa_opentextfile() !  +- sa_openarrayfile() ! sa_cloasefiles() !  +- sa_closetextfile() !  +- sa_closearrayfile() ! %%%%%%%%%%%%%%%%%%%%%%%% */ ! ! static void sa_opentextfile(SUFARY*, char*); ! static void sa_openarrayfile(SUFARY*, char*); ! static void sa_closetextfile(SUFARY*); ! static void sa_closearrayfile(SUFARY*); ! /********************************************** ! * SUFARY *sa_openfiles(char *s, char *t); * * purpose * 指定されたテキストファイル(s)とarrayファイル(t)を開く。 --- 46,59 ---- #include "chadic.h" #include "sufary.h" ! static void sa_opentextfile(SUFARY *, char *); ! static void sa_openarrayfile(SUFARY *, char *); ! static void sa_closetextfile(SUFARY *); ! static void sa_closearrayfile(SUFARY *); ! /* ! * SUFARY *sa_openfiles(char *s, char *t); * * purpose * 指定されたテキストファイル(s)とarrayファイル(t)を開く。 *************** *** 76,83 **** * * return value * 作成されたSUFARY型変数 ! **********************************************/ ! SUFARY *sa_openfiles(char *s, char *t) { SUFARY *newary; char aryname[8192]; --- 66,74 ---- * * return value * 作成されたSUFARY型変数 ! */ ! SUFARY * ! sa_openfiles(char *s, char *t) { SUFARY *newary; char aryname[8192]; *************** *** 85,93 **** newary = cha_malloc(sizeof(SUFARY)); sa_opentextfile(newary, s); ! if (t == NULL){ ! /* アーギュメント 1つ : array file が指定されていない場合 */ ! sprintf(aryname, "%s.ary", s); /* (Rel1.4: .pat -> .ary) */ t = aryname; } sa_openarrayfile(newary, t); --- 76,83 ---- newary = cha_malloc(sizeof(SUFARY)); sa_opentextfile(newary, s); ! if (t == NULL) { /* text filename is not specified */ ! sprintf(aryname, "%s.ary", s); t = aryname; } sa_openarrayfile(newary, t); *************** *** 96,147 **** } ! /********************************************** ! * void sa_opentextfile(SUFARY *ary,char *s); ! * ! * purpose ! * テキストファイルだけを開く ! * ! * parameters ! * ary : ファイル情報(サイズなど)を格納するSUFARY型変数 ! * s : テキストファイル名 ! * ! **********************************************/ ! static void sa_opentextfile(SUFARY *ary, char *filename) { ! off_t size; /* ファイルサイズ */ void *map; ! ! /* 既にオープンされているものがあればクローズ */ ! if (ary->txtmap != NULL){ sa_closetextfile(ary); } - /* ファイルのオープン */ size = cha_mmap_file(filename, &map); ! ary->txtsz = size; ary->txtmap = map; } ! /********************************************** ! * void sa_openarrayfile(SUFARY *ary, char *s); ! * ! * purpose ! * Array ファイルを開く。 ! * ! * parameters ! * ary : ファイル情報(サイズなど)を格納するSUFARY型変数 ! * s : Array ファイル名 ! * ! **********************************************/ ! static void sa_openarrayfile(SUFARY *ary, char *filename) { ! off_t size; /* ファイルサイズ */ void *map; ! /* 既にオープンされているものがあればクローズ */ ! if (ary->arymap != NULL){ sa_closearrayfile(ary); } --- 86,126 ---- } ! /* ! * open text file ! */ ! static void ! sa_opentextfile(SUFARY * ary, char *filename) { ! off_t size; void *map; ! ! /* ! * 既にオープンされているものがあればクローズ ! */ ! if (ary->txtmap != NULL) { sa_closetextfile(ary); } size = cha_mmap_file(filename, &map); ! ary->txtsz = size; ary->txtmap = map; } ! /* ! * open array file ! */ ! static void ! sa_openarrayfile(SUFARY * ary, char *filename) { ! off_t size; void *map; ! /* ! * 既にオープンされているものがあればクローズ ! */ ! if (ary->arymap != NULL) { sa_closearrayfile(ary); } *************** *** 149,161 **** ary->arysz = size; ary->arraysize = size / sizeof(long); ary->arymap = map; ! /* left, right は検索範囲の内側を指す 980319 */ ary->left = 0; ary->right = ary->arraysize - 1; } ! /********************************************** ! * void sa_closefiles(SUFARY *ary); * * purpose * 指定されたファイルを閉じる --- 128,142 ---- ary->arysz = size; ary->arraysize = size / sizeof(long); ary->arymap = map; ! /* ! * left, right は検索範囲の内側を指す ! */ ary->left = 0; ary->right = ary->arraysize - 1; } ! /* ! * void sa_closefiles(SUFARY *ary); * * purpose * 指定されたファイルを閉じる *************** *** 168,225 **** * * description * テキストファイルとarrayファイルを同時に閉じる ! **********************************************/ ! void sa_closefiles(SUFARY *ary) { sa_closetextfile(ary); sa_closearrayfile(ary); cha_free(ary); } ! /********************************************** ! * void sa_closetextfile(SUFARY *ary); ! * ! * purpose ! * close the text file ! * 指定されたテキストファイルだけ閉じる ! * ! * parameters ! * ary : valuable (type SUFARY) for file which should be closed ! * (閉じたいファイルに関するSUFARY型変数) ! * ! * return value ! * none ! * なし ! **********************************************/ ! static void sa_closetextfile(SUFARY *ary) { ! if (ary->txtmap != NULL){ ! /* mmap の解放 */ cha_munmap_file(ary->txtmap, ary->txtsz); ary->txtmap = NULL; ary->txtsz = 0; } } ! /********************************************** ! * void sa_closearrayfile(void); ! * ! * purpose ! * close the array file ! * (指定されたarrayファイルだけ閉じるクローズ) ! * ! * parameters ! * ary : valuable (type SUFARY) for file which should be closed ! * (閉じたいファイルに関するSUFARY型変数) ! * ! * return value ! * none ! * なし ! **********************************************/ ! static void sa_closearrayfile(SUFARY *ary) { ! if (ary->arymap != NULL){ ! /* mmap の解放 */ cha_munmap_file(ary->arymap, ary->arysz); ary->arymap = NULL; ary->arysz = 0; --- 149,183 ---- * * description * テキストファイルとarrayファイルを同時に閉じる ! */ ! void ! sa_closefiles(SUFARY * ary) { sa_closetextfile(ary); sa_closearrayfile(ary); cha_free(ary); } ! /* ! * close text file ! */ ! static void ! sa_closetextfile(SUFARY * ary) { ! if (ary->txtmap != NULL) { cha_munmap_file(ary->txtmap, ary->txtsz); ary->txtmap = NULL; ary->txtsz = 0; } } ! /* ! * close array file ! */ ! static void ! sa_closearrayfile(SUFARY * ary) { ! if (ary->arymap != NULL) { cha_munmap_file(ary->arymap, ary->arysz); ary->arymap = NULL; ary->arysz = 0; diff -crN chasen-2.2.3/lib/connect.c chasen-2.2.4/lib/connect.c *** chasen-2.2.3/lib/connect.c Fri Feb 23 21:51:34 2001 --- chasen-2.2.4/lib/connect.c Sat Feb 24 15:17:22 2001 *************** *** 1,5 **** /* - * * connect.c - library for connection matrix * * Copyright (C) 2000, 2001, Nara Institute of Science and Technology --- 1,4 ---- *************** *** 34,112 **** * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * 1990/12/17/Mon Yutaka MYOKI(Nagao Lab., KUEE) ! * $Id: connect.c,v 1.6 2001/02/23 12:51:34 kazuma-t Exp $ ! */ #include "chadic.h" #define CHA_LINEMAX 8192 ! static int tbl_num; /* 連接表のサイズ */ ! static int tbl_num_goi; /* 連接表の語彙の数 */ ! static int i_num; /* 連接行列の行 */ ! static int j_num; /* 連接行列の列 */ static rensetu_pair_t *rensetu_tbl; static connect_rule_t *connect_mtr; /* ! ------------------------------------------------------------------------------ ! rensetu table ! ------------------------------------------------------------------------------ ! */ ! static int cmp_pair(rensetu_pair_t *pair1, rensetu_pair_t *pair2) { int ret; ! ! /* 見出し語 */ /* surface form */ if (pair1->goi == NULL && pair2->goi != NULL) ! return 1; if (pair1->goi != NULL && pair2->goi == NULL) ! return -1; ! ! /* 品詞分類 */ /* POS */ if ((ret = pair1->hinsi - pair2->hinsi) != 0) ! return ret; ! /* 活用型*/ /* Conjugation type */ if ((ret = pair1->type - pair2->type) != 0) - return ret; - - /* 見出し語 */ /* surface form */ - if (pair1->goi != NULL && pair2->goi != NULL) - if ((ret = strcmp(pair1->goi, pair2->goi)) != 0) return ret; ! ! /* 活用形 */ /* Conjugation form */ if ((ret = pair1->form - pair2->form) != 0) ! return ret; ! return pair1->index - pair2->index; } ! static char *cha_numtok(char *s, int *valp) { int minus = 0; while (*s == ' ') ! s++; if (*s == '-') { minus = 1; s++; } if (*s < '0' || *s > '9') ! cha_exit_file(1, "illegal format"); ! for (*valp = 0; *s >= '0' && *s <= '9'; *valp = *valp * 10 + *s++ - '0'); while (*s == ' ') ! s++; if (minus) ! *valp = -*valp; return s; } ! void cha_read_table(FILE *fp_out, int dir) { FILE *fp; char *filepath; --- 33,123 ---- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * 1990/12/17/Mon Yutaka MYOKI(Nagao Lab., KUEE) ! * $Id: connect.c,v 1.7 2001/02/24 06:17:22 kazuma-t Exp $ ! */ #include "chadic.h" #define CHA_LINEMAX 8192 ! static int tbl_num; /* 連接表のサイズ */ ! static int tbl_num_goi; /* 連接表の語彙の数 */ ! static int i_num; /* 連接行列の行 */ ! static int j_num; /* 連接行列の列 */ static rensetu_pair_t *rensetu_tbl; static connect_rule_t *connect_mtr; /* ! * rensetu table ! */ ! static int ! cmp_pair(rensetu_pair_t * pair1, rensetu_pair_t * pair2) { int ret; ! ! /* ! * 見出し語 surface form ! */ if (pair1->goi == NULL && pair2->goi != NULL) ! return 1; if (pair1->goi != NULL && pair2->goi == NULL) ! return -1; ! ! /* ! * 品詞分類 POS ! */ if ((ret = pair1->hinsi - pair2->hinsi) != 0) ! return ret; ! /* ! * 活用型 Conjugation type ! */ if ((ret = pair1->type - pair2->type) != 0) return ret; ! ! /* ! * 見出し語 surface form ! */ ! if (pair1->goi != NULL && pair2->goi != NULL) ! if ((ret = strcmp(pair1->goi, pair2->goi)) != 0) ! return ret; ! ! /* ! * 活用形 Conjugation form ! */ if ((ret = pair1->form - pair2->form) != 0) ! return ret; ! return pair1->index - pair2->index; } ! static char * ! cha_numtok(char *s, int *valp) { int minus = 0; while (*s == ' ') ! s++; if (*s == '-') { minus = 1; s++; } if (*s < '0' || *s > '9') ! cha_exit_file(1, "illegal format"); ! for (*valp = 0; *s >= '0' && *s <= '9'; ! *valp = *valp * 10 + *s++ - '0'); while (*s == ' ') ! s++; if (minus) ! *valp = -*valp; return s; } ! void ! cha_read_table(FILE * fp_out, int dir) { FILE *fp; char *filepath; *************** *** 116,142 **** fp = cha_fopen_grammar(TABLE_FILE, "r", 1, dir, &filepath); if (fp_out != NULL) ! fprintf(fp_out, "parsing %s\n", filepath); Cha_lineno_error = ++Cha_lineno; fscanf(fp, "%d\n", &cell_num); rensetu_tbl = ! (rensetu_pair_t *)cha_malloc(sizeof(rensetu_pair_t) * cell_num); tbl_num = 0; for (i = 0; i < cell_num; i++) { Cha_lineno_error = ++Cha_lineno; if (fgets(buf, sizeof(buf), fp) == NULL) ! cha_exit_file(1, "illegal format"); Cha_lineno_error = ++Cha_lineno; if (fgets(s = buf, sizeof(buf), fp) == NULL) ! cha_exit_file(1, "illegal format"); s = cha_numtok(s, &val); rensetu_tbl[i].i_pos = val; s = cha_numtok(s, &val); rensetu_tbl[i].j_pos = val; if (!tbl_num && val < 0) ! tbl_num = i; buf[strlen(buf) - 1] = '\0'; if (*s >= '0' && *s <= '9') { s = cha_numtok(s, &val); --- 127,153 ---- fp = cha_fopen_grammar(TABLE_FILE, "r", 1, dir, &filepath); if (fp_out != NULL) ! fprintf(fp_out, "parsing %s\n", filepath); Cha_lineno_error = ++Cha_lineno; fscanf(fp, "%d\n", &cell_num); rensetu_tbl = ! (rensetu_pair_t *) cha_malloc(sizeof(rensetu_pair_t) * cell_num); tbl_num = 0; for (i = 0; i < cell_num; i++) { Cha_lineno_error = ++Cha_lineno; if (fgets(buf, sizeof(buf), fp) == NULL) ! cha_exit_file(1, "illegal format"); Cha_lineno_error = ++Cha_lineno; if (fgets(s = buf, sizeof(buf), fp) == NULL) ! cha_exit_file(1, "illegal format"); s = cha_numtok(s, &val); rensetu_tbl[i].i_pos = val; s = cha_numtok(s, &val); rensetu_tbl[i].j_pos = val; if (!tbl_num && val < 0) ! tbl_num = i; buf[strlen(buf) - 1] = '\0'; if (*s >= '0' && *s <= '9') { s = cha_numtok(s, &val); *************** *** 155,182 **** #endif tbl_num_goi++; } ! } else { /* for EDRdic '94.Mar */ rensetu_tbl[i].hinsi = UCHAR_MAX; rensetu_tbl[i].goi = cha_strdup(s); #ifdef SJIS ! sjis2euc(rensetu_tbl[i].goi); #endif } } if (!tbl_num) ! tbl_num = cell_num; fclose(fp); } ! /* for EDRdic '94.Mar */ ! void cha_check_edrtable(mrph_t *mrph_p, chasen_cell_t *x) { int i; for (i = 0; i < tbl_num; i++) { ! if ( rensetu_tbl[i].hinsi == UCHAR_MAX && strcmp(cha_s_atom(x), rensetu_tbl[i].goi) == 0) { mrph_p->con_tbl = i; return; --- 166,196 ---- #endif tbl_num_goi++; } ! } else { /* for EDRdic '94.Mar */ rensetu_tbl[i].hinsi = UCHAR_MAX; rensetu_tbl[i].goi = cha_strdup(s); #ifdef SJIS ! sjis2euc(rensetu_tbl[i].goi); #endif } } if (!tbl_num) ! tbl_num = cell_num; fclose(fp); } ! /* ! * for EDRdic '94.Mar ! */ ! void ! cha_check_edrtable(mrph_t * mrph_p, chasen_cell_t * x) { int i; for (i = 0; i < tbl_num; i++) { ! if (rensetu_tbl[i].hinsi == UCHAR_MAX && strcmp(cha_s_atom(x), rensetu_tbl[i].goi) == 0) { mrph_p->con_tbl = i; return; *************** *** 185,197 **** cha_exit_file(1, "no morpheme in EDR table"); } ! /* for EDRdic '94.Mar */ ! void cha_check_edrtable_str(mrph_t *mrph_p, char *str) { int i; ! for (i = 0; i < tbl_num; i++) { ! if ( rensetu_tbl[i].hinsi == UCHAR_MAX && strcmp(str, rensetu_tbl[i].goi) == 0) { mrph_p->con_tbl = i; return; --- 199,214 ---- cha_exit_file(1, "no morpheme in EDR table"); } ! /* ! * for EDRdic '94.Mar ! */ ! void ! cha_check_edrtable_str(mrph_t * mrph_p, char *str) { int i; ! for (i = 0; i < tbl_num; i++) { ! if (rensetu_tbl[i].hinsi == UCHAR_MAX && strcmp(str, rensetu_tbl[i].goi) == 0) { mrph_p->con_tbl = i; return; *************** *** 200,271 **** cha_exit_file(1, "no morpheme in EDR table"); } ! static int find_table(mrph_t *mrph, rensetu_pair_t *pair) { int ret; ! /* 品詞分類 */ /* POS */ if ((ret = mrph->hinsi - pair->hinsi) != 0) ! return ret; ! /* 活用型 */ /* Conjugation type */ if ((ret = mrph->ktype - pair->type) != 0) ! return ret; ! /* 見出し語 */ /* surface form */ ! if (pair->goi && (ret = strcmp(mrph->midasi,pair->goi))) ! return ret; ! /* 活用語ならば、活用形の1番とマッチ*/ if (mrph->ktype) ! return 1 - pair->form; return 0; } ! int cha_check_table(mrph_t *mrph) /* 970301 tatuo: void -> int for 頑健化 */ { rensetu_pair_t *ret; if (rensetu_tbl[0].hinsi == 0) ! qsort(rensetu_tbl, tbl_num, sizeof(rensetu_pair_t), (int (*)())cmp_pair); ret = (rensetu_pair_t *) ! bsearch(mrph, rensetu_tbl, tbl_num_goi, ! sizeof(rensetu_pair_t), (int (*)())find_table); ! if (ret){ mrph->con_tbl = ret->index; ! return 1; /* 970301 tatuo: 問題ない時は 1 を返す */ } ret = (rensetu_pair_t *) ! bsearch(mrph, rensetu_tbl + tbl_num_goi, tbl_num - tbl_num_goi, ! sizeof(rensetu_pair_t), (int (*)())find_table); ! if (ret){ mrph->con_tbl = ret->index; ! return 1; /* 970301 tatuo: 問題ない時は 1 を返す */ } ! cha_exit_file(-1, "no morpheme in connection table\n"); /* 970301 tatuo: 1 -> -1 */ ! return 0; /* 970301 tatuo: 問題ある時は 0 を返す */ } ! int cha_check_table_for_undef(int hinsi) { ! int i; ! for (i = 0; i < tbl_num; i++) ! if (rensetu_tbl[i].hinsi == hinsi) ! if (!rensetu_tbl[i].goi) ! return i; ! return -1; } /* ! ------------------------------------------------------------------------------ ! rensetu matrix ! ------------------------------------------------------------------------------ ! */ ! void cha_read_matrix(FILE *fp_out) { FILE *fp; char *filepath; --- 217,303 ---- cha_exit_file(1, "no morpheme in EDR table"); } ! static int ! find_table(mrph_t * mrph, rensetu_pair_t * pair) { int ret; ! /* ! * 品詞分類 POS ! */ if ((ret = mrph->hinsi - pair->hinsi) != 0) ! return ret; ! /* ! * 活用型 Conjugation type ! */ if ((ret = mrph->ktype - pair->type) != 0) ! return ret; ! /* ! * 見出し語 surface form ! */ ! if (pair->goi && (ret = strcmp(mrph->midasi, pair->goi))) ! return ret; ! /* ! * 活用語ならば、活用形の1番とマッチ ! */ if (mrph->ktype) ! return 1 - pair->form; return 0; } ! /* if an error occurs, this function returns 0, else returns 1 */ ! int ! cha_check_table(mrph_t * mrph) { rensetu_pair_t *ret; if (rensetu_tbl[0].hinsi == 0) ! qsort(rensetu_tbl, tbl_num, sizeof(rensetu_pair_t), ! (int (*)()) cmp_pair); ret = (rensetu_pair_t *) ! bsearch(mrph, rensetu_tbl, tbl_num_goi, ! sizeof(rensetu_pair_t), (int (*)()) find_table); ! if (ret) { mrph->con_tbl = ret->index; ! return 1; } ret = (rensetu_pair_t *) ! bsearch(mrph, rensetu_tbl + tbl_num_goi, tbl_num - tbl_num_goi, ! sizeof(rensetu_pair_t), (int (*)()) find_table); ! if (ret) { mrph->con_tbl = ret->index; ! return 1; /* if no error, return 1 */ } ! /* ! * error ! */ ! cha_exit_file(-1, "no morpheme in connection table\n"); ! return 0; } ! int ! cha_check_table_for_undef(int hinsi) { ! int i; ! for (i = 0; i < tbl_num; i++) ! if (rensetu_tbl[i].hinsi == hinsi) ! if (!rensetu_tbl[i].goi) ! return i; ! return -1; } /* ! * rensetu matrix ! */ ! void ! cha_read_matrix(FILE * fp_out) { FILE *fp; char *filepath; *************** *** 275,293 **** fp = cha_fopen_grammar(MATRIX_FILE, "r", 1, 1, &filepath); if (fp_out != NULL) ! fprintf(fp_out, "parsing %s", filepath); Cha_lineno_error = ++Cha_lineno; fscanf(fp, "%d %d\n", &i_num, &j_num); connect_mtr = (connect_rule_t *) ! cha_malloc(sizeof(connect_rule_t) * i_num * j_num); next = 0; for (i = 0; i < i_num; i++) { Cha_lineno_error = ++Cha_lineno; if (fgets(s = buf, sizeof(buf), fp) == NULL) ! cha_exit_file(1, "illegal format"); ! for (j = 0; j < j_num; ) { int nval; if (*s == 'o') { s = cha_numtok(s + 1, &nval); --- 307,325 ---- fp = cha_fopen_grammar(MATRIX_FILE, "r", 1, 1, &filepath); if (fp_out != NULL) ! fprintf(fp_out, "parsing %s", filepath); Cha_lineno_error = ++Cha_lineno; fscanf(fp, "%d %d\n", &i_num, &j_num); connect_mtr = (connect_rule_t *) ! cha_malloc(sizeof(connect_rule_t) * i_num * j_num); next = 0; for (i = 0; i < i_num; i++) { Cha_lineno_error = ++Cha_lineno; if (fgets(s = buf, sizeof(buf), fp) == NULL) ! cha_exit_file(1, "illegal format"); ! for (j = 0; j < j_num;) { int nval; if (*s == 'o') { s = cha_numtok(s + 1, &nval); *************** *** 295,306 **** } else { s = cha_numtok(s, &next); if (*s++ != ',') ! cha_exit_file(1, "illegal format"); s = cha_numtok(s, &cost); if (*s == 'x') ! s = cha_numtok(s + 1, &nval); else ! nval = 1; } while (nval-- > 0) { connect_mtr[i * j_num + j].next = next; --- 327,338 ---- } else { s = cha_numtok(s, &next); if (*s++ != ',') ! cha_exit_file(1, "illegal format"); s = cha_numtok(s, &cost); if (*s == 'x') ! s = cha_numtok(s + 1, &nval); else ! nval = 1; } while (nval-- > 0) { connect_mtr[i * j_num + j].next = next; *************** *** 312,332 **** fclose(fp); } ! int cha_check_automaton(int state, int con, int undef_con_cost, int *costp) { connect_rule_t *cr; #if 0 ! printf("[%d:%d:%d]\n",state,con,rensetu_tbl[con].j_pos);fflush(stdout); #endif cr = &connect_mtr[state * j_num + rensetu_tbl[con].j_pos]; *costp = cr->cost; if (*costp == 0) ! *costp = undef_con_cost; else ! (*costp)--; #ifdef DEBUG ! printf("[state:%d,con:%d,newcon:%d] ",state,con,cr->next+con); #endif return rensetu_tbl[cr->next + con].i_pos; --- 344,366 ---- fclose(fp); } ! int ! cha_check_automaton(int state, int con, int undef_con_cost, int *costp) { connect_rule_t *cr; #if 0 ! printf("[%d:%d:%d]\n", state, con, rensetu_tbl[con].j_pos); ! fflush(stdout); #endif cr = &connect_mtr[state * j_num + rensetu_tbl[con].j_pos]; *costp = cr->cost; if (*costp == 0) ! *costp = undef_con_cost; else ! (*costp)--; #ifdef DEBUG ! printf("[state:%d,con:%d,newcon:%d] ", state, con, cr->next + con); #endif return rensetu_tbl[cr->next + con].i_pos; diff -crN chasen-2.2.3/lib/dic.c chasen-2.2.4/lib/dic.c *** chasen-2.2.3/lib/dic.c Wed Feb 14 09:20:52 2001 --- chasen-2.2.4/lib/dic.c Fri Mar 16 06:25:48 2001 *************** *** 1,5 **** /* ! * dic.c -- library for dictionary * * Copyright (C) 2000, 2001, Nara Institute of Science and Technology * --- 1,5 ---- /* ! * dic.c -- library for parsing dictionary * * Copyright (C) 2000, 2001, Nara Institute of Science and Technology * *************** *** 33,89 **** * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * $Id: dic.c,v 1.3 2001/02/14 00:20:52 masayu-a Exp $ */ #include "chalib.h" ! /* 辞書ファイル名 */ ! SUFARY *Suf_dicfile[PAT_DIC_NUM]; /* dictionary name (SUFDIC) */ ! pat_t *Pat_dicfile[PAT_DIC_NUM]; /* dictionary name (PATDIC) */ int Suf_ndicfile = 0; int Pat_ndicfile = 0; ! void cha_get_mrph_data(mrph2_t *mrph, char *pbuf, char *target) { unsigned char *p = pbuf; ! mrph->midasi = target; /* surfacr form */ ! mrph->is_undef = 0; /* unseen word or not */ ! /* 見出し語の長さ */ /* the length of surface form */ ! while (*p++) ! ; ! mrph->base_length = mrph->length = (char *)p - pbuf - 1; ! /* 読み */ /* Japanese reading */ mrph->yomi = p; ! while (*p++) ! ; ! /* 発音 */ /* Japanese pronunciation */ mrph->pron = p; ! while (*p++) ! ; ! /* 原形 */ /* base form */ mrph->base = p; ! while (*p++) ! ; ! /* 意味 */ /* semantic information */ mrph->info = p; ! while (*p++) ! ; ! /* 品詞大分類 No. */ /* POS number */ ! mrph->hinsi = (p[0]-CHAINT_OFFSET)*CHAINT_SCALE + p[1]-CHAINT_OFFSET; p += 2; ! /* 活用型 No. */ /* Conjugation type */ mrph->ktype = *p++ - CHAINT_OFFSET; ! /* 活用形 No. */ /* Conjugation form */ mrph->kform = *p++ - CHAINT_OFFSET; ! /* 重み */ /* cost for morpheme */ ! mrph->weight = (p[0]-CHAINT_OFFSET)*CHAINT_SCALE + p[1]-CHAINT_OFFSET; p += 2; ! /* 接続テーブル番号 */ /* the number for connection matrix */ ! mrph->con_tbl = (p[0]-CHAINT_OFFSET)*CHAINT_SCALE + p[1]-CHAINT_OFFSET; p += 2; ! /* 複合語 */ /* compound words information */ ! mrph->comp = p; } --- 33,109 ---- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * $Id: dic.c,v 1.5 2001/03/15 21:25:48 masayu-a Exp $ */ #include "chalib.h" ! SUFARY *Suf_dicfile[PAT_DIC_NUM]; /* dictionary (SUFDIC) */ ! pat_t *Pat_dicfile[PAT_DIC_NUM]; /* dictionary (PATDIC) */ int Suf_ndicfile = 0; int Pat_ndicfile = 0; ! void ! cha_get_mrph_data(mrph2_t * mrph, char *pbuf, char *target) { unsigned char *p = pbuf; ! mrph->midasi = target; /* surface form */ ! mrph->is_undef = 0; /* unseen word or not */ ! /* ! * 見出し語の長さ the length of surface form ! */ ! while (*p++); ! mrph->base_length = mrph->length = (char *) p - pbuf - 1; ! /* ! * 読み Japanese reading ! */ mrph->yomi = p; ! while (*p++); ! /* ! * 発音 Japanese pronunciation ! */ mrph->pron = p; ! while (*p++); ! /* ! * 原形 infinitive form ! */ mrph->base = p; ! while (*p++); ! /* ! * 意味 semantic information ! */ mrph->info = p; ! while (*p++); ! /* ! * 品詞大分類 POS number ! */ ! mrph->hinsi = ! (p[0] - CHAINT_OFFSET) * CHAINT_SCALE + p[1] - CHAINT_OFFSET; p += 2; ! /* ! * 活用型 Conjugation type ! */ mrph->ktype = *p++ - CHAINT_OFFSET; ! /* ! * 活用形 Conjugation form ! */ mrph->kform = *p++ - CHAINT_OFFSET; ! /* ! * 重み cost for morpheme ! */ ! mrph->weight = ! (p[0] - CHAINT_OFFSET) * CHAINT_SCALE + p[1] - CHAINT_OFFSET; p += 2; ! /* ! * 接続テーブル番号 the number for connection matrix ! */ ! mrph->con_tbl = ! (p[0] - CHAINT_OFFSET) * CHAINT_SCALE + p[1] - CHAINT_OFFSET; p += 2; ! /* ! * 複合語 compound words information ! */ ! mrph->compound = p; } diff -crN chasen-2.2.3/lib/getid.c chasen-2.2.4/lib/getid.c *** chasen-2.2.3/lib/getid.c Fri Feb 23 20:50:10 2001 --- chasen-2.2.4/lib/getid.c Sat Feb 24 15:17:22 2001 *************** *** 33,62 **** * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * $Id: getid.c,v 1.5 2001/02/23 11:50:10 kazuma-t Exp $ */ #include "chadic.h" ! /* get POS str id */ ! int cha_get_nhinsi_str_id(char **hinsi) { int id, i, d; if (!*hinsi) ! cha_exit_file(1, "an empty string for POS"); for (id = 0; *hinsi; hinsi++) { if (!**hinsi) ! cha_exit_file(1, "an empty string for POS"); for (i = 0; (d = Cha_hinsi[id].daughter[i]) != 0; i++) { if (!strcmp(Cha_hinsi[d].name, *hinsi)) ! break; } ! if (!d){ cha_exit_file(1, "POS `%s' is undefined", *hinsi); - /* return 0; 970301 tatuo: 問題ある時は 0 を返す */ } id = d; } --- 33,65 ---- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * $Id: getid.c,v 1.6 2001/02/24 06:17:22 kazuma-t Exp $ */ #include "chadic.h" ! /* ! * get POS str id ! * ! */ ! int ! cha_get_nhinsi_str_id(char **hinsi) { int id, i, d; if (!*hinsi) ! cha_exit_file(1, "an empty string for POS"); for (id = 0; *hinsi; hinsi++) { if (!**hinsi) ! cha_exit_file(1, "an empty string for POS"); for (i = 0; (d = Cha_hinsi[id].daughter[i]) != 0; i++) { if (!strcmp(Cha_hinsi[d].name, *hinsi)) ! break; } ! if (!d) { cha_exit_file(1, "POS `%s' is undefined", *hinsi); } id = d; } *************** *** 64,108 **** return id; } ! /* get POS id */ ! int cha_get_nhinsi_id(chasen_cell_t *cell) { char *hinsi_str[256]; char **hinsi = hinsi_str; for (; !nullp(cell); cell = cha_cdr(cell)) ! *hinsi++ = cha_s_atom(cha_car(cell)); *hinsi = NULL; return cha_get_nhinsi_str_id(hinsi_str); } ! /* get ctype id */ ! int cha_get_type_id(char *x) { int i; ! if (x==NULL) { cha_exit_file(1, "null string for type"); return 0; } if (x[0] == '*' && x[1] == '\0') ! return 0; ! for (i = 1; strcmp(Cha_type[i].name, x); ) { if (!Cha_type[++i].name) { cha_exit_file(1, "type `%s' is undefined", x); - /* return 255; 970301 tatuo: 問題ある時は 255 を返す(暫定) */ } } return i; } ! /* get cform id */ ! int cha_get_form_id(char *x, int type) { int i; --- 67,119 ---- return id; } ! /* ! * get POS id ! */ ! int ! cha_get_nhinsi_id(chasen_cell_t * cell) { char *hinsi_str[256]; char **hinsi = hinsi_str; for (; !nullp(cell); cell = cha_cdr(cell)) ! *hinsi++ = cha_s_atom(cha_car(cell)); *hinsi = NULL; return cha_get_nhinsi_str_id(hinsi_str); } ! /* ! * get ctype id ! */ ! int ! cha_get_type_id(char *x) { int i; ! if (x == NULL) { cha_exit_file(1, "null string for type"); return 0; } if (x[0] == '*' && x[1] == '\0') ! return 0; ! for (i = 1; strcmp(Cha_type[i].name, x);) { if (!Cha_type[++i].name) { cha_exit_file(1, "type `%s' is undefined", x); } } return i; } ! /* ! * get cform id ! */ ! int ! cha_get_form_id(char *x, int type) { int i; *************** *** 112,127 **** } if (x[0] == '*' && x[1] == '\0') ! return 0; if (type == 0) { cha_exit_file(1, "Invalid type number for type `%s'", x); return 0; } ! for (i = 1; strcmp(Cha_form[type][i].name, x); ) { if (!Cha_form[type][++i].name) { ! cha_exit_file(1, "type `%s' has no conjugation `%s'", Cha_type[type].name, x); return 0; } } --- 123,139 ---- } if (x[0] == '*' && x[1] == '\0') ! return 0; if (type == 0) { cha_exit_file(1, "Invalid type number for type `%s'", x); return 0; } ! for (i = 1; strcmp(Cha_form[type][i].name, x);) { if (!Cha_form[type][++i].name) { ! cha_exit_file(1, "type `%s' has no conjugation `%s'", ! Cha_type[type].name, x); return 0; } } diff -crN chasen-2.2.3/lib/getopt.c chasen-2.2.4/lib/getopt.c *** chasen-2.2.3/lib/getopt.c Fri Feb 23 20:50:10 2001 --- chasen-2.2.4/lib/getopt.c Sat Feb 24 15:17:22 2001 *************** *** 35,41 **** * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * * Author: A.Kitauchi , Apr 1997 ! * $Id: getopt.c,v 1.3 2001/02/23 11:50:10 kazuma-t Exp $ */ #include --- 35,41 ---- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * * Author: A.Kitauchi , Apr 1997 ! * $Id: getopt.c,v 1.4 2001/02/24 06:17:22 kazuma-t Exp $ */ #include *************** *** 44,55 **** int Cha_optind = 0; char *Cha_optarg; ! int cha_getopt(char **argv, char *optstring, FILE *fp) { static char *nextchar; char *op, c; ! /* initialization */ if (Cha_optind == 0) { Cha_optind = 1; nextchar = argv[1]; --- 44,58 ---- int Cha_optind = 0; char *Cha_optarg; ! int ! cha_getopt(char **argv, char *optstring, FILE * fp) { static char *nextchar; char *op, c; ! /* ! * initialization ! */ if (Cha_optind == 0) { Cha_optind = 1; nextchar = argv[1]; *************** *** 57,98 **** Cha_optarg = NULL; if (nextchar == argv[Cha_optind]) { ! /* no option */ if (nextchar == NULL || nextchar[0] != '-' || nextchar[1] == '\0') ! return EOF; ! /* '--' option */ if (*++nextchar == '-') { nextchar = argv[++Cha_optind]; return EOF; } } ! /* find out an option letter */ c = *nextchar++; if ((op = strchr(optstring, c)) == NULL || c == ':') { if (fp != NULL) ! fprintf(fp, "%s: invalid option -- %c\n", argv[0], c); c = '?'; } ! /* option with an argument */ else if (op[1] == ':') { ! /* next character */ if (*nextchar) ! Cha_optarg = nextchar; ! /* next argv */ else if (argv[Cha_optind + 1] != NULL) ! Cha_optarg = argv[++Cha_optind]; ! /* no argument */ else { if (fp != NULL) ! fprintf(fp, "%s: option requires an argument -- %c\n", argv[0], c); c = '?'; } nextchar = argv[++Cha_optind]; } if (nextchar != NULL && *nextchar == '\0') ! nextchar = argv[++Cha_optind]; return c; } --- 60,116 ---- Cha_optarg = NULL; if (nextchar == argv[Cha_optind]) { ! /* ! * no option ! */ if (nextchar == NULL || nextchar[0] != '-' || nextchar[1] == '\0') ! return EOF; ! /* ! * '--' option ! */ if (*++nextchar == '-') { nextchar = argv[++Cha_optind]; return EOF; } } ! /* ! * find out an option letter ! */ c = *nextchar++; if ((op = strchr(optstring, c)) == NULL || c == ':') { if (fp != NULL) ! fprintf(fp, "%s: invalid option -- %c\n", argv[0], c); c = '?'; } ! /* ! * option with an argument ! */ else if (op[1] == ':') { ! /* ! * next character ! */ if (*nextchar) ! Cha_optarg = nextchar; ! /* ! * next argv ! */ else if (argv[Cha_optind + 1] != NULL) ! Cha_optarg = argv[++Cha_optind]; ! /* ! * no argument ! */ else { if (fp != NULL) ! fprintf(fp, "%s: option requires an argument -- %c\n", ! argv[0], c); c = '?'; } nextchar = argv[++Cha_optind]; } if (nextchar != NULL && *nextchar == '\0') ! nextchar = argv[++Cha_optind]; return c; } *************** *** 100,106 **** /* * chasen_getopt */ ! int cha_getopt_chasen(char **argv, FILE *fp) { return cha_getopt(argv, "sP:D:RabmpdvfecMo:F:L:l:jr:w:O:BChV", fp); } --- 118,125 ---- /* * chasen_getopt */ ! int ! cha_getopt_chasen(char **argv, FILE * fp) { return cha_getopt(argv, "sP:D:RabmpdvfecMo:F:L:l:jr:w:O:BChV", fp); } *************** *** 108,152 **** #ifdef TEST ! int main (int argc, char *argv[]) { int c; ! while (1) ! { ! c = cha_getopt (argv, "abc:d:", stderr); ! if (c == EOF) break; - switch (c) - { - case 'a': - printf ("option a\n"); - break; - - case 'b': - printf ("option b\n"); - break; - - case 'c': - printf ("option c with value `%s'\n", Cha_optarg); - break; - - case '?': - break; - - default: - printf ("?? getopt returned character code 0%o ??\n", c); - } - } - - if (Cha_optind < argc) - { - printf ("non-option ARGV-elements: "); - while (Cha_optind < argc) - printf ("%s ", argv[Cha_optind++]); - printf ("\n"); - } ! exit (0); } #endif /* TEST */ --- 127,169 ---- #ifdef TEST ! int ! main(int argc, char *argv[]) { int c; ! while (1) { ! c = cha_getopt(argv, "abc:d:", stderr); ! if (c == EOF) ! break; ! switch (c) { ! case 'a': ! printf("option a\n"); ! break; ! ! case 'b': ! printf("option b\n"); ! break; ! ! case 'c': ! printf("option c with value `%s'\n", Cha_optarg); break; ! case '?': ! break; ! ! default: ! printf("?? getopt returned character code 0%o ??\n", c); ! } ! } ! ! if (Cha_optind < argc) { ! printf("non-option ARGV-elements: "); ! while (Cha_optind < argc) ! printf("%s ", argv[Cha_optind++]); ! printf("\n"); ! } ! ! exit(0); } #endif /* TEST */ diff -crN chasen-2.2.3/lib/grammar.c chasen-2.2.4/lib/grammar.c *** chasen-2.2.3/lib/grammar.c Fri Feb 23 21:51:34 2001 --- chasen-2.2.4/lib/grammar.c Sat Feb 24 15:17:22 2001 *************** *** 34,61 **** * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * 1990/11/14/Wed Yutaka MYOKI(Nagao Lab., KUEE) * modified by A.Kitauchi , Nov. 1996 ! * $Id: grammar.c,v 1.6 2001/02/23 12:51:34 kazuma-t Exp $ */ #include "chadic.h" hinsi_t Cha_hinsi[HINSI_MAX]; ! /*********************************************************************** * make_hinsi ! ***********************************************************************/ ! static int make_hinsi(chasen_cell_t *cell, int mother, int idx) { char *name, *s; int depth, i, d; short *path; if (idx >= HINSI_MAX) ! cha_exit_file(1, "too many (over %d) parts of speech", HINSI_MAX); ! /* path */ depth = Cha_hinsi[mother].depth + 1; path = cha_malloc(sizeof(short) * (depth + 1)); memcpy(path, Cha_hinsi[mother].path, sizeof(short) * depth); --- 34,64 ---- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * 1990/11/14/Wed Yutaka MYOKI(Nagao Lab., KUEE) * modified by A.Kitauchi , Nov. 1996 ! * $Id: grammar.c,v 1.7 2001/02/24 06:17:22 kazuma-t Exp $ */ #include "chadic.h" hinsi_t Cha_hinsi[HINSI_MAX]; ! /* * make_hinsi ! */ ! static int ! make_hinsi(chasen_cell_t * cell, int mother, int idx) { char *name, *s; int depth, i, d; short *path; if (idx >= HINSI_MAX) ! cha_exit_file(1, "too many (over %d) parts of speech", HINSI_MAX); ! /* ! * path ! */ depth = Cha_hinsi[mother].depth + 1; path = cha_malloc(sizeof(short) * (depth + 1)); memcpy(path, Cha_hinsi[mother].path, sizeof(short) * depth); *************** *** 64,101 **** Cha_hinsi[idx].depth = depth; Cha_hinsi[idx].path = path; ! /* hinsi name and katsuyou */ name = cha_s_atom(cha_car(cell)); #if 0 ! printf("%2d:%*s%s\n",depth,depth*2,"",name); fflush(stdout); #endif ! /* 品詞の二重登録のチェック あまりきれいな方法ではない */ for (i = 0; Cha_hinsi[mother].daughter[i + 1]; i++) { if (!strcmp(Cha_hinsi[Cha_hinsi[mother].daughter[i]].name, name)) ! cha_exit_file(1, "hinsi `%s' is already defined", name); } s = name + strlen(name) - 1; if (Cha_hinsi[mother].kt == 1 || *s == '%') { Cha_hinsi[idx].kt = 1; if (*s == '%') ! *s = '\0'; } if ((s = strchr(name, '/')) != NULL) { *s++ = '\0'; Cha_hinsi[idx].bkugiri = *s ? cha_strdup(s) : JSTR_BKUGIRI; } else if (Cha_hinsi[mother].bkugiri) ! Cha_hinsi[idx].bkugiri = Cha_hinsi[mother].bkugiri; if (*name == '\0') ! cha_exit_file(1, "an empty string for hinsi name"); Cha_hinsi[idx].name = cha_strdup(name); #if 0 cha_s_print(stdout, cha_car(cell)); ! printf("[%d,%d,%s]\n",mother,idx,name); fflush(stdout); #endif --- 67,108 ---- Cha_hinsi[idx].depth = depth; Cha_hinsi[idx].path = path; ! /* ! * hinsi name and katsuyou ! */ name = cha_s_atom(cha_car(cell)); #if 0 ! printf("%2d:%*s%s\n", depth, depth * 2, "", name); fflush(stdout); #endif ! /* ! * 品詞の二重登録のチェック あまりきれいな方法ではない ! */ for (i = 0; Cha_hinsi[mother].daughter[i + 1]; i++) { if (!strcmp(Cha_hinsi[Cha_hinsi[mother].daughter[i]].name, name)) ! cha_exit_file(1, "hinsi `%s' is already defined", name); } s = name + strlen(name) - 1; if (Cha_hinsi[mother].kt == 1 || *s == '%') { Cha_hinsi[idx].kt = 1; if (*s == '%') ! *s = '\0'; } if ((s = strchr(name, '/')) != NULL) { *s++ = '\0'; Cha_hinsi[idx].bkugiri = *s ? cha_strdup(s) : JSTR_BKUGIRI; } else if (Cha_hinsi[mother].bkugiri) ! Cha_hinsi[idx].bkugiri = Cha_hinsi[mother].bkugiri; if (*name == '\0') ! cha_exit_file(1, "an empty string for hinsi name"); Cha_hinsi[idx].name = cha_strdup(name); #if 0 cha_s_print(stdout, cha_car(cell)); ! printf("[%d,%d,%s]\n", mother, idx, name); fflush(stdout); #endif *************** *** 107,113 **** short daughter[256]; int ndaughter = 0; d = idx + 1; ! /* 品詞の二重登録のチェックのため一時的に daughter を代入 */ Cha_hinsi[idx].daughter = daughter; for (; !nullp(cell); cell = cha_cdr(cell)) { daughter[ndaughter++] = d; --- 114,122 ---- short daughter[256]; int ndaughter = 0; d = idx + 1; ! /* ! * 品詞の二重登録のチェックのため一時的に daughter を代入 ! */ Cha_hinsi[idx].daughter = daughter; for (; !nullp(cell); cell = cha_cdr(cell)) { daughter[ndaughter++] = d; *************** *** 116,139 **** } daughter[ndaughter++] = 0; Cha_hinsi[idx].daughter = cha_malloc(sizeof(short) * ndaughter); ! memcpy(Cha_hinsi[idx].daughter, daughter, sizeof(short) * ndaughter); idx = d; } return idx; } ! /*********************************************************************** * cha_read_class ! ***********************************************************************/ ! void cha_read_class(FILE *fp) { static short path0 = 0; chasen_cell_t *cell1; short daughter[256]; int idx, ndaughter; ! /* root node */ Cha_hinsi[0].path = &path0; Cha_hinsi[0].depth = 0; Cha_hinsi[0].kt = 0; --- 125,152 ---- } daughter[ndaughter++] = 0; Cha_hinsi[idx].daughter = cha_malloc(sizeof(short) * ndaughter); ! memcpy(Cha_hinsi[idx].daughter, daughter, ! sizeof(short) * ndaughter); idx = d; } return idx; } ! /* * cha_read_class ! */ ! void ! cha_read_class(FILE * fp) { static short path0 = 0; chasen_cell_t *cell1; short daughter[256]; int idx, ndaughter; ! /* ! * root node ! */ Cha_hinsi[0].path = &path0; Cha_hinsi[0].depth = 0; Cha_hinsi[0].kt = 0; *************** *** 142,148 **** idx = 1; ndaughter = 0; ! /* 品詞の二重登録のチェックのため一時的に daughter を代入 */ Cha_hinsi[0].daughter = daughter; while (!cha_s_feof(fp)) { if (!nullp(cell1 = cha_s_read(fp))) { --- 155,163 ---- idx = 1; ndaughter = 0; ! /* ! * 品詞の二重登録のチェックのため一時的に daughter を代入 ! */ Cha_hinsi[0].daughter = daughter; while (!cha_s_feof(fp)) { if (!nullp(cell1 = cha_s_read(fp))) { *************** *** 152,184 **** } } - #if 0 - { - /* 文節区切りを追加 */ - chasen_cell_t *cell2 = cha_cons(cha_tmp_atom(JSTR_BUNSETSU_KUGIRI), NIL); - daughter[ndaughter++] = idx; - daughter[ndaughter] = 0; - idx = make_hinsi(cell2, 0, idx); - } - #endif - daughter[ndaughter++] = 0; Cha_hinsi[0].daughter = cha_malloc(sizeof(short) * ndaughter); memcpy(Cha_hinsi[0].daughter, daughter, sizeof(short) * ndaughter); ! /* last node */ Cha_hinsi[idx].name = NULL; } ! /*********************************************************************** * cha_match_nhinsi - cellのwildcard表現がhinsiとマッチしているかどうか ! ***********************************************************************/ ! int cha_match_nhinsi(chasen_cell_t *cell, int hinsi) { char *name; short *path; ! for (path = Cha_hinsi[hinsi].path; !nullp(cell); path++, cell = cha_cdr(cell)) { name = cha_s_atom(cha_car(cell)); if (!*path) { /* --- 167,193 ---- } } daughter[ndaughter++] = 0; Cha_hinsi[0].daughter = cha_malloc(sizeof(short) * ndaughter); memcpy(Cha_hinsi[0].daughter, daughter, sizeof(short) * ndaughter); ! /* ! * last node ! */ Cha_hinsi[idx].name = NULL; } ! /* * cha_match_nhinsi - cellのwildcard表現がhinsiとマッチしているかどうか ! */ ! int ! cha_match_nhinsi(chasen_cell_t * cell, int hinsi) { char *name; short *path; ! for (path = Cha_hinsi[hinsi].path; !nullp(cell); ! path++, cell = cha_cdr(cell)) { name = cha_s_atom(cha_car(cell)); if (!*path) { /* *************** *** 188,221 **** * connect.cha でも使われる可能性がある */ if (strcmp(name, "*")) ! return 0; ! /* これ以降は *path の値が 0 になるようにする */ path--; } else { if (strcmp(name, "*") && strcmp(name, Cha_hinsi[*path].name)) ! return 0; } } ! /* cell が hinsi よりも粗い分類ならマッチ */ return 1; } ! /*********************************************************************** * cha_read_grammar - read GRAMMAR_FILE and set Class[][] * * inputs: * dir - 0: read from current directory * 1: read from grammar directory * 2: read from current directory or grammar directory ! ***********************************************************************/ ! void cha_read_grammar(FILE *fp_out, int ret, int dir) { FILE *fp; char *filepath; fp = cha_fopen_grammar(GRAMMAR_FILE, "r", ret, dir, &filepath); if (fp_out != NULL) ! fprintf(fp_out, "parsing %s\n", filepath); cha_read_class(fp); --- 197,235 ---- * connect.cha でも使われる可能性がある */ if (strcmp(name, "*")) ! return 0; ! /* ! * これ以降は *path の値が 0 になるようにする ! */ path--; } else { if (strcmp(name, "*") && strcmp(name, Cha_hinsi[*path].name)) ! return 0; } } ! /* ! * cell が hinsi よりも粗い分類ならマッチ ! */ return 1; } ! /* * cha_read_grammar - read GRAMMAR_FILE and set Class[][] * * inputs: * dir - 0: read from current directory * 1: read from grammar directory * 2: read from current directory or grammar directory ! */ ! void ! cha_read_grammar(FILE * fp_out, int ret, int dir) { FILE *fp; char *filepath; fp = cha_fopen_grammar(GRAMMAR_FILE, "r", ret, dir, &filepath); if (fp_out != NULL) ! fprintf(fp_out, "parsing %s\n", filepath); cha_read_class(fp); diff -crN chasen-2.2.3/lib/htobe.c chasen-2.2.4/lib/htobe.c *** chasen-2.2.3/lib/htobe.c Thu Jan 1 09:00:00 1970 --- chasen-2.2.4/lib/htobe.c Tue Mar 20 05:16:04 2001 *************** *** 0 **** --- 1,75 ---- + /* + * Big Endian <-> Host byte order converter + * + * Copyright (C) 1996, 1997, 2000, 2001, + * Nara Institute of Science and Technology + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nara Institute of + * Science and Technology. + * 4. The name Nara Institute of Science and Technology may not be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * + * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute + * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Id: htobe.c,v 1.1.2.1 2001/03/19 20:16:04 masayu-a Exp $ + */ + + #include "htobe.h" + + #if !defined WORDS_BIGENDIAN && !defined HAVE_HTONL + + union clong { + unsigned long ulong; + unsigned char uchar[sizeof(unsigned long)]; + }; + + unsigned long + __htobe(unsigned long hostlong) + { + int i; + union clong cl; + + for (i = 0; i < sizeof(unsigned long); i++) { + cl.uchar[i] = (hostlong >> (i * 8)) & 0xff; + } + + return cl.ulong; + } + + unsigned long + __betoh(unsigned long belong) + { + int i; + union clong cl; + + for (i = sizeof(unsigned long) - 1; i >= 0; i--) { + cl.uchar[i] = (belong >> (i * 8)) & 0xff; + } + + return cl.ulong; + } + #endif diff -crN chasen-2.2.3/lib/htobe.h chasen-2.2.4/lib/htobe.h *** chasen-2.2.3/lib/htobe.h Thu Jan 1 09:00:00 1970 --- chasen-2.2.4/lib/htobe.h Tue Mar 20 05:16:04 2001 *************** *** 0 **** --- 1,37 ---- + /* + * Big Endian <-> Host byte order converter + * + * $Id: htobe.h,v 1.1.2.1 2001/03/19 20:16:04 masayu-a Exp $ + */ + + #include "config.h" + + #ifndef __HTOBE_H__ + #define __HTOBE_H__ + + #ifndef WORDS_BIGENDIAN + #ifdef HAVE_HTONL + #if HAVE_WINSOCK2_H + #include /* MSWindows htonl() etc */ + #endif /* HAVE_WINSOCK2_H */ + #if HAVE_SYS_PARAM_H + #include /* FreeBSD htonl() etc */ + #endif /* HAVE_SYS_PARAM_H */ + #if HAVE_SYS_TYPES_H + /* At least SunOS4 needs + to include sys/types.h before netinet/in.h. There have also + been a problem report for FreeBSD which seems to indicate + the same dependency on that platform aswell. */ + #include + #endif /* HAVE_SYS_TYPES_H */ + #if HAVE_NETINET_IN_H + #include /* Linux htonl() etc */ + #endif /* HAVE_NETINET_IN_H */ + #define htobe htonl + #define betoh ntohl + #else /* HAVE_HTONL */ + #define htobe __htobe + #define betoh __betoh + #endif /* HAVE_HTONL */ + #endif /* WORDS_BIGENDIAN */ + #endif /* __HTOBE_H__ */ diff -crN chasen-2.2.3/lib/init.c chasen-2.2.4/lib/init.c *** chasen-2.2.3/lib/init.c Fri Feb 23 20:50:10 2001 --- chasen-2.2.4/lib/init.c Fri Mar 16 06:25:48 2001 *************** *** 35,47 **** * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * * Modified by A.Kitauchi Sep. 1996 ! * $Id: init.c,v 1.8 2001/02/23 11:50:10 kazuma-t Exp $ */ #include "chalib.h" #include "pat.h" ! /* .chasenrc default values */ #define POS_COST_DEFAULT 1 #define RENSETSU_WEIGHT_DEFAULT 1 #define KEITAISO_WEIGHT_DEFAULT 1 --- 35,50 ---- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * * Modified by A.Kitauchi Sep. 1996 ! * $Id: init.c,v 1.12 2001/03/15 21:25:48 masayu-a Exp $ */ #include "chalib.h" #include "pat.h" + #include "tokenizer.h" ! /* ! * .chasenrc default values ! */ #define POS_COST_DEFAULT 1 #define RENSETSU_WEIGHT_DEFAULT 1 #define KEITAISO_WEIGHT_DEFAULT 1 *************** *** 57,63 **** anno_info Cha_anno_info[UNDEF_HINSI_MAX]; undef_info Cha_undef_info[UNDEF_HINSI_MAX]; int Cha_undef_info_num = 0; ! int Cha_output_compo = 1; char *Cha_bos_string = ""; char *Cha_eos_string = "EOS\n"; --- 60,66 ---- anno_info Cha_anno_info[UNDEF_HINSI_MAX]; undef_info Cha_undef_info[UNDEF_HINSI_MAX]; int Cha_undef_info_num = 0; ! int Cha_output_iscompound = 1; char *Cha_bos_string = ""; char *Cha_eos_string = "EOS\n"; *************** *** 65,72 **** extern int Suf_ndicfile; extern int Pat_ndicfile; ! /***********************************************************************/ ! static void read_class_cost(chasen_cell_t *cell) { int hinsi, cost; --- 68,75 ---- extern int Suf_ndicfile; extern int Pat_ndicfile; ! static void ! read_class_cost(chasen_cell_t * cell) { int hinsi, cost; *************** *** 74,98 **** chasen_cell_t *cell1 = cha_car(cha_car(cell)); chasen_cell_t *cell2 = cha_cdr(cha_car(cell)); char *s = cha_s_atom(cha_car(cell1)); ! if (strmatch3(s, JSTR_UNKNOWN_WORD1, JSTR_UNKNOWN_WORD2, ESTR_UNKNOWN_WORD)) { int i; ! for (i=0; i i) ! Cha_undef_info_num = i; } else if (!strcmp(s, "*")) { cost = atoi(cha_s_atom(cha_car(cell2))); for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++) ! if (Cha_hinsi[hinsi].cost == 0) ! Cha_hinsi[hinsi].cost = cost; } else { int match = 0; cost = atoi(cha_s_atom(cha_car(cell2))); --- 77,106 ---- chasen_cell_t *cell1 = cha_car(cha_car(cell)); chasen_cell_t *cell2 = cha_cdr(cha_car(cell)); char *s = cha_s_atom(cha_car(cell1)); ! if (strmatch3 ! (s, JSTR_UNKNOWN_WORD1, JSTR_UNKNOWN_WORD2, ! ESTR_UNKNOWN_WORD)) { int i; ! for (i = 0; i < UNDEF_HINSI_MAX && !nullp(cell2); ! i++, cell2 = cha_cdr(cell2)) { chasen_cell_t *cell3 = cha_car(cell2); if (atomp(cell3)) { Cha_undef_info[i].cost = atoi(cha_s_atom(cell3)); Cha_undef_info[i].cost_step = 0; } else { ! Cha_undef_info[i].cost = ! atoi(cha_s_atom(cha_car(cell3))); ! Cha_undef_info[i].cost_step = ! atoi(cha_s_atom(cha_car(cha_cdr(cell3)))); } ! } if (Cha_undef_info_num == 0 || Cha_undef_info_num > i) ! Cha_undef_info_num = i; } else if (!strcmp(s, "*")) { cost = atoi(cha_s_atom(cha_car(cell2))); for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++) ! if (Cha_hinsi[hinsi].cost == 0) ! Cha_hinsi[hinsi].cost = cost; } else { int match = 0; cost = atoi(cha_s_atom(cha_car(cell2))); *************** *** 103,143 **** } } if (!match) ! cha_exit_file(1, "invalid hinsi name `%s'\n", cha_s_tostr(cell1)); } } ! /* default */ ! for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++) ! if (Cha_hinsi[hinsi].cost == 0) ! Cha_hinsi[hinsi].cost = POS_COST_DEFAULT; ! ! /* 文頭 文末 */ Cha_hinsi[0].cost = 0; } ! /***********************************************************************/ ! static void read_composition(chasen_cell_t *cell) { ! int comp, pos; chasen_cell_t *cell2, *cell3; for (; !nullp(cell); cell = cha_cdr(cell)) { cell2 = cha_car(cell); ! comp = cha_get_nhinsi_id(cha_car(cell2)); if (!nullp(cha_cdr(cell2))) ! cell2 = cha_cdr(cell2); for (; !nullp(cell2); cell2 = cha_cdr(cell2)) { cell3 = cha_car(cell2); for (pos = 1; Cha_hinsi[pos].name; pos++) ! if (cha_match_nhinsi(cell3, pos)) ! Cha_hinsi[pos].comp = comp; } } } ! /***********************************************************************/ ! static void eval_chasenrc_sexp(chasen_cell_t *cell) { char *cell1_str; chasen_cell_t *cell2; --- 111,156 ---- } } if (!match) ! cha_exit_file(1, "invalid hinsi name `%s'\n", ! cha_s_tostr(cell1)); } } ! /* ! * default ! */ ! for (hinsi = 1; Cha_hinsi[hinsi].name; hinsi++) ! if (Cha_hinsi[hinsi].cost == 0) ! Cha_hinsi[hinsi].cost = POS_COST_DEFAULT; ! ! /* ! * 文頭 文末 ! */ Cha_hinsi[0].cost = 0; } ! static void ! read_composition(chasen_cell_t * cell) { ! int composit, pos; chasen_cell_t *cell2, *cell3; for (; !nullp(cell); cell = cha_cdr(cell)) { cell2 = cha_car(cell); ! composit = cha_get_nhinsi_id(cha_car(cell2)); if (!nullp(cha_cdr(cell2))) ! cell2 = cha_cdr(cell2); for (; !nullp(cell2); cell2 = cha_cdr(cell2)) { cell3 = cha_car(cell2); for (pos = 1; Cha_hinsi[pos].name; pos++) ! if (cha_match_nhinsi(cell3, pos)) ! Cha_hinsi[pos].composit = composit; } } } ! static void ! eval_chasenrc_sexp(chasen_cell_t * cell) { char *cell1_str; chasen_cell_t *cell2; *************** *** 145,162 **** cell1_str = cha_s_atom(cha_car(cell)); cell2 = cha_car(cha_cdr(cell)); if (Cha_errno) ! return; ! /* 辞書ファイル(patdic, sufdic) */ if (!strcmp(cell1_str, ESTR_PAT_FILE)) ! cha_read_patdic(cha_cdr(cell)); else if (!strcmp(cell1_str, ESTR_SUF_FILE)) ! cha_read_sufdic(cha_cdr(cell)); ! /* 空白品詞(space pos) */ else if (strmatch2(cell1_str, JSTR_SPACE_POS, ESTR_SPACE_POS)) { Cha_anno_info[0].hinsi = cha_get_nhinsi_id(cell2); } ! /* 注釈(annotation) */ else if (strmatch2(cell1_str, JSTR_ANNOTATION, ESTR_ANNOTATION)) { int i; for (i = 1, cell2 = cha_cdr(cell); --- 158,181 ---- cell1_str = cha_s_atom(cha_car(cell)); cell2 = cha_car(cha_cdr(cell)); if (Cha_errno) ! return; ! /* ! * 辞書ファイル(patdic, sufdic) ! */ if (!strcmp(cell1_str, ESTR_PAT_FILE)) ! cha_read_patdic(cha_cdr(cell)); else if (!strcmp(cell1_str, ESTR_SUF_FILE)) ! cha_read_sufdic(cha_cdr(cell)); ! /* ! * 空白品詞(space pos) ! */ else if (strmatch2(cell1_str, JSTR_SPACE_POS, ESTR_SPACE_POS)) { Cha_anno_info[0].hinsi = cha_get_nhinsi_id(cell2); } ! /* ! * 注釈(annotation) ! */ else if (strmatch2(cell1_str, JSTR_ANNOTATION, ESTR_ANNOTATION)) { int i; for (i = 1, cell2 = cha_cdr(cell); *************** *** 164,191 **** i++, cell2 = cha_cdr(cell2)) { chasen_cell_t *cell3 = cha_car(cell2); chasen_cell_t *cell4; ! /* str1, len1 */ Cha_anno_info[i].str1 = cha_s_atom(cha_car(cha_car(cell3))); Cha_anno_info[i].len1 = strlen(Cha_anno_info[i].str1); cell4 = cha_car(cha_cdr(cha_car(cell3))); ! /* str2, len2 */ Cha_anno_info[i].str2 = nullp(cell4) ? "" : cha_s_atom(cell4); Cha_anno_info[i].len2 = strlen(Cha_anno_info[i].str2); ! /* hinsi */ cell4 = cha_car(cha_cdr(cell3)); if (!nullp(cell4)) { if (atomp(cell4)) { ! /* format string */ Cha_anno_info[i].format = cha_s_atom(cell4); } else { ! /* pos */ Cha_anno_info[i].hinsi = cha_get_nhinsi_id(cell4); } } } } ! /* 未知語品詞 */ ! else if (strmatch3(cell1_str, JSTR_UNKNOWN_POS1, JSTR_UNKNOWN_POS2, ESTR_UNKNOWN_POS)) { int i; cell2 = cha_cdr(cell); for (i = 0; i < UNDEF_HINSI_MAX && !nullp(cell2); --- 183,224 ---- i++, cell2 = cha_cdr(cell2)) { chasen_cell_t *cell3 = cha_car(cell2); chasen_cell_t *cell4; ! /* ! * str1, len1 ! */ Cha_anno_info[i].str1 = cha_s_atom(cha_car(cha_car(cell3))); Cha_anno_info[i].len1 = strlen(Cha_anno_info[i].str1); cell4 = cha_car(cha_cdr(cha_car(cell3))); ! /* ! * str2, len2 ! */ Cha_anno_info[i].str2 = nullp(cell4) ? "" : cha_s_atom(cell4); Cha_anno_info[i].len2 = strlen(Cha_anno_info[i].str2); ! /* ! * hinsi ! */ cell4 = cha_car(cha_cdr(cell3)); if (!nullp(cell4)) { if (atomp(cell4)) { ! /* ! * format string ! */ Cha_anno_info[i].format = cha_s_atom(cell4); } else { ! /* ! * pos ! */ Cha_anno_info[i].hinsi = cha_get_nhinsi_id(cell4); } } } } ! /* ! * 未知語品詞 ! */ ! else if (strmatch3 ! (cell1_str, JSTR_UNKNOWN_POS1, JSTR_UNKNOWN_POS2, ! ESTR_UNKNOWN_POS)) { int i; cell2 = cha_cdr(cell); for (i = 0; i < UNDEF_HINSI_MAX && !nullp(cell2); *************** *** 193,266 **** Cha_undef_info[i].hinsi = cha_get_nhinsi_id(cha_car(cell2)); } if (Cha_undef_info_num == 0 || Cha_undef_info_num > i) ! Cha_undef_info_num = i; } ! /* 連接コスト重み */ else if (strmatch2(cell1_str, JSTR_CONN_WEIGHT, ESTR_CONN_WEIGHT)) ! Cha_con_cost_weight = atoi(cha_s_atom(cell2)) * MRPH_DEFAULT_WEIGHT; ! /* 形態素コスト重み */ else if (strmatch2(cell1_str, JSTR_MRPH_WEIGHT, ESTR_MRPH_WEIGHT)) ! Cha_mrph_cost_weight = atoi(cha_s_atom(cell2)); ! /* コスト幅 */ else if (strmatch2(cell1_str, JSTR_COST_WIDTH, ESTR_COST_WIDTH)) ! cha_set_cost_width(atoi(cha_s_atom(cell2))); ! /* 品詞コスト */ else if (strmatch2(cell1_str, JSTR_POS_COST, ESTR_POS_COST)) ! read_class_cost(cha_cdr(cell)); ! /* 未定義連接コスト */ else if (strmatch2(cell1_str, JSTR_DEF_CONN_COST, ESTR_DEF_CONN_COST)) ! Cha_con_cost_undef = (int) atoi(cha_s_atom(cell2)); ! /* 連結品詞 */ ! else if (strmatch2(cell1_str, JSTR_COMPO_POS, ESTR_COMPO_POS)) ! read_composition(cha_cdr(cell)); ! /* 複合語 */ ! else if (strmatch2(cell1_str, JSTR_OUTPUT_COMPO, ESTR_OUTPUT_COMPO)) ! Cha_output_compo = strmatch2(cha_s_atom(cell2), JSTR_SEG, ESTR_SEG) ? 0 : 1; ! /* 出力フォーマット */ else if (strmatch2(cell1_str, JSTR_OUTPUT_FORMAT, ESTR_OUTPUT_FORMAT)) ! cha_set_opt_form(cha_s_atom(cell2)); ! /* 言語 */ else if (strmatch2(cell1_str, JSTR_LANG, ESTR_LANG)) ! cha_set_language(cha_s_atom(cell2)); ! /* BOS文字列 */ else if (strmatch2(cell1_str, JSTR_BOS_STR, ESTR_BOS_STR)) ! Cha_bos_string = cha_s_atom(cell2); ! /* EOS文字列 */ else if (strmatch2(cell1_str, JSTR_EOS_STR, ESTR_EOS_STR)) ! Cha_eos_string = cha_s_atom(cell2); ! /* 区切り文字 */ else if (strmatch2(cell1_str, JSTR_DELIMITER, ESTR_DELIMITER)) ! cha_set_jfgets_delimiter(cha_s_atom(cell2)); } ! /*********************************************************************** * cha_read_rcfile_fp() ! ***********************************************************************/ ! void cha_read_rcfile_fp(FILE *fp) { chasen_cell_t *cell; while (!cha_s_feof(fp)) { cell = cha_s_read(fp); if (!Cha_errno) ! eval_chasenrc_sexp(cell); ! } ! ! /* default language */ ! if (!Cha_lang_j && !Cha_lang_e) { ! Cha_lang_j = 1; ! Cha_encode = CHA_ENCODE_EUC; ! #if 0 ! if (!Suf_ndicfile) ! Cha_lang_j = 1; ! else ! Cha_lang_j = Cha_lang_e = 1; ! #endif } } ! /***********************************************************************/ ! static void read_chasenrc(void) { FILE *fp; char *rcpath; --- 226,314 ---- Cha_undef_info[i].hinsi = cha_get_nhinsi_id(cha_car(cell2)); } if (Cha_undef_info_num == 0 || Cha_undef_info_num > i) ! Cha_undef_info_num = i; } ! /* ! * 連接コスト重み ! */ else if (strmatch2(cell1_str, JSTR_CONN_WEIGHT, ESTR_CONN_WEIGHT)) ! Cha_con_cost_weight = ! atoi(cha_s_atom(cell2)) * MRPH_DEFAULT_WEIGHT; ! /* ! * 形態素コスト重み ! */ else if (strmatch2(cell1_str, JSTR_MRPH_WEIGHT, ESTR_MRPH_WEIGHT)) ! Cha_mrph_cost_weight = atoi(cha_s_atom(cell2)); ! /* ! * コスト幅 ! */ else if (strmatch2(cell1_str, JSTR_COST_WIDTH, ESTR_COST_WIDTH)) ! cha_set_cost_width(atoi(cha_s_atom(cell2))); ! /* ! * 品詞コスト ! */ else if (strmatch2(cell1_str, JSTR_POS_COST, ESTR_POS_COST)) ! read_class_cost(cha_cdr(cell)); ! /* ! * 未定義連接コスト ! */ else if (strmatch2(cell1_str, JSTR_DEF_CONN_COST, ESTR_DEF_CONN_COST)) ! Cha_con_cost_undef = (int) atoi(cha_s_atom(cell2)); ! /* ! * 連結品詞 ! */ ! else if (strmatch2(cell1_str, JSTR_COMPOSIT_POS, ESTR_COMPOSIT_POS)) ! read_composition(cha_cdr(cell)); ! /* ! * 複合語 ! */ ! else if (strmatch2(cell1_str, JSTR_OUTPUT_COMPOUND, ESTR_OUTPUT_COMPOUND)) ! Cha_output_iscompound = ! strmatch2(cha_s_atom(cell2), JSTR_SEG, ESTR_SEG) ? 0 : 1; ! /* ! * 出力フォーマット ! */ else if (strmatch2(cell1_str, JSTR_OUTPUT_FORMAT, ESTR_OUTPUT_FORMAT)) ! cha_set_opt_form(cha_s_atom(cell2)); ! /* ! * 言語 ! */ else if (strmatch2(cell1_str, JSTR_LANG, ESTR_LANG)) ! cha_set_language(cha_s_atom(cell2)); ! /* ! * BOS文字列 ! */ else if (strmatch2(cell1_str, JSTR_BOS_STR, ESTR_BOS_STR)) ! Cha_bos_string = cha_s_atom(cell2); ! /* ! * EOS文字列 ! */ else if (strmatch2(cell1_str, JSTR_EOS_STR, ESTR_EOS_STR)) ! Cha_eos_string = cha_s_atom(cell2); ! /* ! * 区切り文字 ! */ else if (strmatch2(cell1_str, JSTR_DELIMITER, ESTR_DELIMITER)) ! cha_set_jfgets_delimiter(cha_s_atom(cell2)); } ! /* * cha_read_rcfile_fp() ! */ ! void ! cha_read_rcfile_fp(FILE * fp) { chasen_cell_t *cell; while (!cha_s_feof(fp)) { cell = cha_s_read(fp); if (!Cha_errno) ! eval_chasenrc_sexp(cell); } } ! static void ! read_chasenrc(void) { FILE *fp; char *rcpath; *************** *** 271,300 **** cha_read_rcfile_fp(fp); fclose(fp); ! /* required options */ if (!Cha_undef_info[0].hinsi) ! cha_exit(1, "%s: UNKNOWN_POS/michigo-hinsi is not specified", ! cha_get_rcpath()); if (!Pat_ndicfile && !Suf_ndicfile) ! cha_exit(1, "%s: patricia dictionary is not specified", ! cha_get_rcpath()); } /* * cha_init - ChaSen's initialization */ ! void cha_init(void) { int i; ! /* 出力関数へのポインタ */ cha_set_fput(Cha_server_mode); ! /* 入力関数へのポインタ */ cha_set_getc_alone(); ! /* cost width */ cha_set_cost_width(COST_WIDTH_DEFAULT); cha_read_grammar_dir(); --- 319,357 ---- cha_read_rcfile_fp(fp); fclose(fp); ! /* ! * required options ! */ if (!Cha_undef_info[0].hinsi) ! cha_exit(1, "%s: UNKNOWN_POS/michigo-hinsi is not specified", ! cha_get_rcpath()); if (!Pat_ndicfile && !Suf_ndicfile) ! cha_exit(1, "%s: patricia dictionary is not specified", ! cha_get_rcpath()); } /* * cha_init - ChaSen's initialization */ ! void ! cha_init(void) { int i; ! /* ! * 出力関数へのポインタ ! */ cha_set_fput(Cha_server_mode); ! /* ! * 入力関数へのポインタ ! */ cha_set_getc_alone(); ! /* ! * cost width ! */ cha_set_cost_width(COST_WIDTH_DEFAULT); cha_read_grammar_dir(); *************** *** 307,311 **** cha_read_matrix(NULL); for (i = 0; i < Cha_undef_info_num; i++) ! Cha_undef_info[i].con_tbl = cha_check_table_for_undef(Cha_undef_info[i].hinsi); } --- 364,369 ---- cha_read_matrix(NULL); for (i = 0; i < Cha_undef_info_num; i++) ! Cha_undef_info[i].con_tbl = ! cha_check_table_for_undef(Cha_undef_info[i].hinsi); } diff -crN chasen-2.2.3/lib/iotool.c chasen-2.2.4/lib/iotool.c *** chasen-2.2.3/lib/iotool.c Fri Feb 23 21:51:34 2001 --- chasen-2.2.4/lib/iotool.c Sat Feb 24 15:17:22 2001 *************** *** 34,46 **** * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * 1990/12/14/Fri Yutaka MYOKI(Nagao Lab., KUEE) ! * 1990/12/25/Tue Modified * Oct. 1996 A.Kitauchi ! * $Id: iotool.c,v 1.10 2001/02/23 12:51:34 kazuma-t Exp $ */ - #include #include #include "chadic.h" #ifdef PATHTYPE_MSDOS --- 34,46 ---- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * 1990/12/14/Fri Yutaka MYOKI(Nagao Lab., KUEE) ! * 1990/12/25/Tue Modified * Oct. 1996 A.Kitauchi ! * $Id: iotool.c,v 1.11 2001/02/24 06:17:22 kazuma-t Exp $ */ #include + #include #include "chadic.h" #ifdef PATHTYPE_MSDOS *************** *** 62,86 **** static char grammar_dir[CHA_FILENAME_LEN]; static char chasenrc_path[CHA_FILENAME_LEN]; ! /*********************************************************************** * cha_convert_escape - convert escape characters ! ***********************************************************************/ ! char *cha_convert_escape(char *str, int ctrl_only) { char *s1, *s2; for (s1 = s2 = str; *s1; s1++, s2++) { if (*s1 != '\\') ! *s2 = *s1; else { switch (*++s1) { ! case 't': ! *s2 = '\t'; break; ! case 'n': ! *s2 = '\n'; break; ! default: if (ctrl_only) ! *s2++ = '\\'; *s2 = *s1; break; } --- 62,89 ---- static char grammar_dir[CHA_FILENAME_LEN]; static char chasenrc_path[CHA_FILENAME_LEN]; ! /* * cha_convert_escape - convert escape characters ! */ ! char * ! cha_convert_escape(char *str, int ctrl_only) { char *s1, *s2; for (s1 = s2 = str; *s1; s1++, s2++) { if (*s1 != '\\') ! *s2 = *s1; else { switch (*++s1) { ! case 't': ! *s2 = '\t'; ! break; ! case 'n': ! *s2 = '\n'; ! break; ! default: if (ctrl_only) ! *s2++ = '\\'; *s2 = *s1; break; } *************** *** 91,114 **** return str; } ! /*********************************************************************** * cha_set_progpath - set program pathname * * progpath is used in cha_exit() and cha_exit_file() ! ***********************************************************************/ ! #if defined _WIN32 && ! defined __CYGWIN__ ! static void which(char *filename, char *path) { char *ps, *pe; int fl; strcpy(path, ".\\"); strcat(path, filename); ! if (fopen(path, "r") != NULL) return; ps = getenv("PATH"); ! for (pe = ps, fl = 0;!fl ; pe++) { if (*pe == '\0') { *pe = ';'; fl = 1; --- 94,119 ---- return str; } ! #if defined _WIN32 && ! defined __CYGWIN__ ! /* * cha_set_progpath - set program pathname * * progpath is used in cha_exit() and cha_exit_file() ! */ ! static void ! which(char *filename, char *path) { char *ps, *pe; int fl; strcpy(path, ".\\"); strcat(path, filename); ! if (fopen(path, "r") != NULL) ! return; ps = getenv("PATH"); ! for (pe = ps, fl = 0; !fl; pe++) { if (*pe == '\0') { *pe = ';'; fl = 1; *************** *** 117,222 **** *pe = '\0'; strcpy(path, ps); if (pe[-1] != '\\') ! strcat(path, "\\"); strcat(path, filename); ! if (fopen(path, "r") != NULL) return; ps = pe + 1; } } } #endif /* _WIN32 */ ! void cha_set_progpath(char *path) { #if defined _WIN32 && ! defined __CYGWIN__ if (strchr(path, PATH_DELIMITER) != NULL) ! strcpy(progpath, path); else ! which("chasen.exe", progpath); ! #else strcpy(progpath, path); ! #endif } ! /*********************************************************************** * cha_set_rcpath - set chasenrc file path * * this function is called when -r option is used. ! ***********************************************************************/ ! void cha_set_rcpath(char *filename) { strcpy(chasenrc_path, filename); ! } ! /*********************************************************************** * cha_get_rcpath * * called only from chasen.c ! ***********************************************************************/ ! char *cha_get_rcpath(void) { return chasenrc_path; } ! /*********************************************************************** * cha_get_grammar_dir * * called only from chasen.c ! ***********************************************************************/ ! char *cha_get_grammar_dir(void) { return grammar_dir; } ! void cha_set_filepath(char *filename) { strcpy(filepath, filename); Cha_lineno = Cha_lineno_error = 0; } ! /*********************************************************************** * cha_fopen - open file, or error end * * inputs: * ret - exit code (don't exit if ret < 0) ! ***********************************************************************/ ! FILE *cha_fopen(char *filename, char *mode, int ret) { FILE *fp; if (filename[0] == '-' && filename[1] == '\0') ! return stdin; if ((fp = fopen(filename, mode)) != NULL) { ! /* filepath is used in cha_exit_file() */ if (*mode == 'r') { if (filename != filepath) ! strcpy(filepath, filename); Cha_lineno = Cha_lineno_error = 0; } } else if (ret >= 0) ! cha_exit_perror(filename); return fp; } ! FILE *cha_fopen2(char *filename1, char *filename2, char *mode, int ret) { FILE *fp; if ((fp = cha_fopen(filename1, mode, -1)) != NULL) ! return fp; if ((fp = cha_fopen(filename2, mode, -1)) != NULL) ! return fp; cha_exit(ret, "can't open %s or %s", filename1, filename2); ! /* to avoid warning */ return NULL; } ! /*********************************************************************** * cha_fopen_grammar - open file from current or grammar directory * * inputs: --- 122,239 ---- *pe = '\0'; strcpy(path, ps); if (pe[-1] != '\\') ! strcat(path, "\\"); strcat(path, filename); ! if (fopen(path, "r") != NULL) ! return; ps = pe + 1; } } } #endif /* _WIN32 */ ! void ! cha_set_progpath(char *path) { #if defined _WIN32 && ! defined __CYGWIN__ if (strchr(path, PATH_DELIMITER) != NULL) ! strcpy(progpath, path); else ! which("chasen.exe", progpath); ! #else /* not _WIN32 */ strcpy(progpath, path); ! #endif /* _WIN32 */ } ! /* * cha_set_rcpath - set chasenrc file path * * this function is called when -r option is used. ! */ ! void ! cha_set_rcpath(char *filename) { strcpy(chasenrc_path, filename); ! } ! /* * cha_get_rcpath * * called only from chasen.c ! */ ! char * ! cha_get_rcpath(void) { return chasenrc_path; } ! /* * cha_get_grammar_dir * * called only from chasen.c ! */ ! char * ! cha_get_grammar_dir(void) { return grammar_dir; } ! void ! cha_set_filepath(char *filename) { strcpy(filepath, filename); Cha_lineno = Cha_lineno_error = 0; } ! /* * cha_fopen - open file, or error end * * inputs: * ret - exit code (don't exit if ret < 0) ! */ ! FILE * ! cha_fopen(char *filename, char *mode, int ret) { FILE *fp; if (filename[0] == '-' && filename[1] == '\0') ! return stdin; if ((fp = fopen(filename, mode)) != NULL) { ! /* ! * filepath is used in cha_exit_file() ! */ if (*mode == 'r') { if (filename != filepath) ! strcpy(filepath, filename); Cha_lineno = Cha_lineno_error = 0; } } else if (ret >= 0) ! cha_exit_perror(filename); return fp; } ! FILE * ! cha_fopen2(char *filename1, char *filename2, char *mode, int ret) { FILE *fp; if ((fp = cha_fopen(filename1, mode, -1)) != NULL) ! return fp; if ((fp = cha_fopen(filename2, mode, -1)) != NULL) ! return fp; cha_exit(ret, "can't open %s or %s", filename1, filename2); ! /* ! * to avoid warning ! */ return NULL; } ! /* * cha_fopen_grammar - open file from current or grammar directory * * inputs: *************** *** 228,316 **** * * outputs: * filepathp - file path string ! ***********************************************************************/ ! FILE *cha_fopen_grammar(char *filename, char *mode, int ret, int dir, char **filepathp) { FILE *fp; *filepathp = filename; switch (dir) { ! case 0: ! /* カレントディレクトリから読み込む */ return cha_fopen(filename, mode, ret); ! case 2: ! /* カレントディレクトリから読み込む */ if ((fp = cha_fopen(filename, mode, -1)) != NULL) ! return fp; ! /* FALLTHRU */ ! default: /* should be 1 */ ! /* 文法ディレクトリから読み込む */ ! /* 文法ディレクトリが設定されていなければ .chasenrc を読み込む */ if (grammar_dir[0] == '\0') ! cha_read_grammar_dir(); sprintf(filepath, "%s%s", grammar_dir, filename); *filepathp = filepath; return cha_fopen(filepath, mode, ret); } } ! FILE *cha_fopen_grammar2(char *filename1, char *filename2, char *mode, int ret, int dir, char **filepathp) { FILE *fp; if (dir == 2) { ! if ((fp = cha_fopen_grammar(filename1, mode, -1, 0, filepathp)) != NULL) ! return fp; ! if ((fp = cha_fopen_grammar(filename2, mode, -1, 0, filepathp)) != NULL) ! return fp; ! if ((fp = cha_fopen_grammar(filename1, mode, -1, 1, filepathp)) != NULL) ! return fp; ! if ((fp = cha_fopen_grammar(filename2, mode, -1, 1, filepathp)) != NULL) ! return fp; } else { ! if ((fp = cha_fopen_grammar(filename1, mode, -1, dir, filepathp)) != NULL) ! return fp; ! if ((fp = cha_fopen_grammar(filename2, mode, -1, dir, filepathp)) != NULL) ! return fp; } cha_exit(ret, "can't open %s or %s", filename1, filename2); ! /* to avoid warning */ return NULL; } /* * cha_malloc() */ ! void *cha_malloc(size_t n) { void *p; if ((p = malloc(n)) == NULL) ! cha_exit_perror("malloc"); return p; } ! void *cha_realloc(void *ptr, size_t n) { void *p; if ((p = realloc(ptr, n)) == NULL) ! cha_exit_perror("realloc"); return p; } #define CHA_MALLOC_SIZE (1024 * 64) ! static char *cha_malloc_char(int size) { static int idx = CHA_MALLOC_SIZE; static char *ptr; if (idx + size >= CHA_MALLOC_SIZE) { ! ptr = (char *)cha_malloc(CHA_MALLOC_SIZE); idx = 0; } --- 245,358 ---- * * outputs: * filepathp - file path string ! */ ! FILE * ! cha_fopen_grammar(char *filename, char *mode, int ret, int dir, ! char **filepathp) { FILE *fp; *filepathp = filename; switch (dir) { ! case 0: ! /* ! * カレントディレクトリから読み込む ! */ return cha_fopen(filename, mode, ret); ! case 2: ! /* ! * カレントディレクトリから読み込む ! */ if ((fp = cha_fopen(filename, mode, -1)) != NULL) ! return fp; ! /* ! * FALLTHRU ! */ ! default: /* should be 1 */ ! /* ! * 文法ディレクトリから読み込む ! * 文法ディレクトリが設定されていなければ .chasenrc を読み込む ! */ if (grammar_dir[0] == '\0') ! cha_read_grammar_dir(); sprintf(filepath, "%s%s", grammar_dir, filename); *filepathp = filepath; return cha_fopen(filepath, mode, ret); } } ! FILE * ! cha_fopen_grammar2(char *filename1, char *filename2, char *mode, int ret, ! int dir, char **filepathp) { FILE *fp; if (dir == 2) { ! if ((fp = ! cha_fopen_grammar(filename1, mode, -1, 0, filepathp)) != NULL) ! return fp; ! if ((fp = ! cha_fopen_grammar(filename2, mode, -1, 0, filepathp)) != NULL) ! return fp; ! if ((fp = ! cha_fopen_grammar(filename1, mode, -1, 1, filepathp)) != NULL) ! return fp; ! if ((fp = ! cha_fopen_grammar(filename2, mode, -1, 1, filepathp)) != NULL) ! return fp; } else { ! if ((fp = ! cha_fopen_grammar(filename1, mode, -1, dir, ! filepathp)) != NULL) ! return fp; ! if ((fp = ! cha_fopen_grammar(filename2, mode, -1, dir, ! filepathp)) != NULL) ! return fp; } cha_exit(ret, "can't open %s or %s", filename1, filename2); ! /* ! * to avoid warning ! */ return NULL; } /* * cha_malloc() */ ! void * ! cha_malloc(size_t n) { void *p; if ((p = malloc(n)) == NULL) ! cha_exit_perror("malloc"); return p; } ! void * ! cha_realloc(void *ptr, size_t n) { void *p; if ((p = realloc(ptr, n)) == NULL) ! cha_exit_perror("realloc"); return p; } #define CHA_MALLOC_SIZE (1024 * 64) ! static char * ! cha_malloc_char(int size) { static int idx = CHA_MALLOC_SIZE; static char *ptr; if (idx + size >= CHA_MALLOC_SIZE) { ! ptr = (char *) cha_malloc(CHA_MALLOC_SIZE); idx = 0; } *************** *** 318,324 **** return ptr + idx - size; } ! char *cha_strdup(char *str) { char *newstr; --- 360,367 ---- return ptr + idx - size; } ! char * ! cha_strdup(char *str) { char *newstr; *************** *** 331,472 **** /* * cha_exit() - print error messages on stderr and exit */ ! void cha_set_stderr(FILE *fp) { cha_stderr = fp; } ! void cha_exit(int status, char *format, ...) { va_list ap; if (Cha_errno) ! return; if (!cha_stderr) ! cha_stderr = stderr; else if (cha_stderr != stderr) ! fputs("500 ", cha_stderr); if (progpath) ! fprintf(cha_stderr, "%s: ", progpath); va_start(ap, format); vfprintf(cha_stderr, format, ap); va_end(ap); if (status >= 0) { fputc('\n', cha_stderr); if (cha_stderr == stderr) ! exit(status); Cha_errno = 1; } } ! void cha_exit_file(int status, char *format, ...) { va_list ap; if (Cha_errno) ! return; if (!cha_stderr) ! cha_stderr = stderr; else if (cha_stderr != stderr) ! fputs("500 ", cha_stderr); if (progpath) ! fprintf(cha_stderr, "%s: ", progpath); if (Cha_lineno == 0) ! ; /* do nothing */ else if (Cha_lineno == Cha_lineno_error) ! fprintf(cha_stderr, "%s:%d: ", filepath, Cha_lineno); else ! fprintf(cha_stderr, "%s:%d-%d: ", filepath, Cha_lineno_error, Cha_lineno); va_start(ap, format); vfprintf(cha_stderr, format, ap); va_end(ap); if (status >= 0) { ! fputc('\n', cha_stderr); if (cha_stderr == stderr) ! exit(status); Cha_errno = 1; } } ! void cha_perror(char *s) { cha_exit(-1, ""); perror(s); } ! void cha_exit_perror(char *s) { cha_perror(s); exit(1); } ! FILE *cha_fopen_rcfile(void) { FILE *fp; char *home_dir, *rc_env, *getenv(); ! /* -R option (standard alone) */ if (!strcmp(chasenrc_path, "*")) { ! /* RCPATH in rcpath.h */ strcpy(chasenrc_path, RCPATH); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; cha_exit(1, "can't open %s", chasenrc_path); } ! /* -r option */ if (chasenrc_path[0]) ! return cha_fopen(chasenrc_path, "r", 1); ! /* environment variable CHASENRC */ if ((rc_env = getenv("CHASENRC")) != NULL) { strcpy(chasenrc_path, rc_env); return cha_fopen(chasenrc_path, "r", 1); } ! /* .chasenrc in the home directory */ if ((home_dir = getenv("HOME")) != NULL) { ! /* .chasenrc */ sprintf(chasenrc_path, "%s%s", home_dir, RC2FILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; sprintf(chasenrc_path, "%s%s", home_dir, RCFILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; } #ifdef PATHTYPE_MSDOS else if ((home_dir = getenv("HOMEDRIVE")) != NULL) { ! sprintf(chasenrc_path, "%s%s%s", home_dir, getenv("HOMEPATH"), RC2FILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; ! sprintf(chasenrc_path, "%s%s%s", home_dir, getenv("HOMEPATH"), RCFILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; } - #endif - - #ifdef PATHTYPE_MSDOS strcpy(chasenrc_path, progpath); sprintf(strrchr(chasenrc_path, PATH_DELIMITER) + 1, "dic%s", RC2FILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; strcpy(chasenrc_path, progpath); sprintf(strrchr(chasenrc_path, PATH_DELIMITER) + 1, "dic%s", RCFILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; ! #endif ! /* RCPATH in rcpath.h */ strcpy(chasenrc_path, RCPATH); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; #ifdef PATHTYPE_MSDOS cha_exit(1, "can't open chasenrc or %s", chasenrc_path); --- 374,535 ---- /* * cha_exit() - print error messages on stderr and exit */ ! void ! cha_set_stderr(FILE * fp) { cha_stderr = fp; } ! void ! cha_exit(int status, char *format, ...) { va_list ap; if (Cha_errno) ! return; if (!cha_stderr) ! cha_stderr = stderr; else if (cha_stderr != stderr) ! fputs("500 ", cha_stderr); if (progpath) ! fprintf(cha_stderr, "%s: ", progpath); va_start(ap, format); vfprintf(cha_stderr, format, ap); va_end(ap); if (status >= 0) { fputc('\n', cha_stderr); if (cha_stderr == stderr) ! exit(status); Cha_errno = 1; } } ! void ! cha_exit_file(int status, char *format, ...) { va_list ap; if (Cha_errno) ! return; if (!cha_stderr) ! cha_stderr = stderr; else if (cha_stderr != stderr) ! fputs("500 ", cha_stderr); if (progpath) ! fprintf(cha_stderr, "%s: ", progpath); if (Cha_lineno == 0) ! ; /* do nothing */ else if (Cha_lineno == Cha_lineno_error) ! fprintf(cha_stderr, "%s:%d: ", filepath, Cha_lineno); else ! fprintf(cha_stderr, "%s:%d-%d: ", filepath, Cha_lineno_error, ! Cha_lineno); va_start(ap, format); vfprintf(cha_stderr, format, ap); va_end(ap); if (status >= 0) { ! fputc('\n', cha_stderr); if (cha_stderr == stderr) ! exit(status); Cha_errno = 1; } } ! void ! cha_perror(char *s) { cha_exit(-1, ""); perror(s); } ! void ! cha_exit_perror(char *s) { cha_perror(s); exit(1); } ! FILE * ! cha_fopen_rcfile(void) { FILE *fp; char *home_dir, *rc_env, *getenv(); ! /* ! * -R option (standard alone) ! */ if (!strcmp(chasenrc_path, "*")) { ! /* ! * RCPATH in rcpath.h ! */ strcpy(chasenrc_path, RCPATH); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; cha_exit(1, "can't open %s", chasenrc_path); } ! /* ! * -r option ! */ if (chasenrc_path[0]) ! return cha_fopen(chasenrc_path, "r", 1); ! /* ! * environment variable CHASENRC ! */ if ((rc_env = getenv("CHASENRC")) != NULL) { strcpy(chasenrc_path, rc_env); return cha_fopen(chasenrc_path, "r", 1); } ! /* ! * .chasenrc in the home directory ! */ if ((home_dir = getenv("HOME")) != NULL) { ! /* ! * .chasenrc ! */ sprintf(chasenrc_path, "%s%s", home_dir, RC2FILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; sprintf(chasenrc_path, "%s%s", home_dir, RCFILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; } #ifdef PATHTYPE_MSDOS else if ((home_dir = getenv("HOMEDRIVE")) != NULL) { ! sprintf(chasenrc_path, "%s%s%s", home_dir, getenv("HOMEPATH"), ! RC2FILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; ! sprintf(chasenrc_path, "%s%s%s", home_dir, getenv("HOMEPATH"), ! RCFILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; } strcpy(chasenrc_path, progpath); sprintf(strrchr(chasenrc_path, PATH_DELIMITER) + 1, "dic%s", RC2FILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; strcpy(chasenrc_path, progpath); sprintf(strrchr(chasenrc_path, PATH_DELIMITER) + 1, "dic%s", RCFILE); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; ! #endif /* PATHTYPE_MSDOS */ ! /* ! * RCPATH in rcpath.h ! */ strcpy(chasenrc_path, RCPATH); if ((fp = cha_fopen(chasenrc_path, "r", -1)) != NULL) ! return fp; #ifdef PATHTYPE_MSDOS cha_exit(1, "can't open chasenrc or %s", chasenrc_path); *************** *** 474,487 **** cha_exit(1, "can't open .chasenrc, .jumanrc, or %s", chasenrc_path); #endif ! /* to avoid warning */ return NULL; } /* * read .chasenrc and set grammar directory */ ! void cha_read_grammar_dir(void) { FILE *fp; chasen_cell_t *cell; --- 537,553 ---- cha_exit(1, "can't open .chasenrc, .jumanrc, or %s", chasenrc_path); #endif ! /* ! * to avoid warning ! */ return NULL; } /* * read .chasenrc and set grammar directory */ ! void ! cha_read_grammar_dir(void) { FILE *fp; chasen_cell_t *cell; *************** *** 507,515 **** char *s; strcpy(grammar_dir, chasenrc_path); if ((s = strrchr(grammar_dir, PATH_DELIMITER)) != NULL) ! s[1] = '\0'; else ! grammar_dir[0] = '\0'; } fclose(fp); --- 573,581 ---- char *s; strcpy(grammar_dir, chasenrc_path); if ((s = strrchr(grammar_dir, PATH_DELIMITER)) != NULL) ! s[1] = '\0'; else ! grammar_dir[0] = '\0'; } fclose(fp); diff -crN chasen-2.2.3/lib/jfgets.c chasen-2.2.4/lib/jfgets.c *** chasen-2.2.3/lib/jfgets.c Fri Feb 23 21:51:34 2001 --- chasen-2.2.4/lib/jfgets.c Thu Mar 8 22:59:23 2001 *************** *** 1,6 **** /* * cha_jfgets.c - fgets() for japaneses ! * by k-chinen@is.aist-nara.ac.jp, 1996. * * Copyright (C) 1996, 1997, 2000, 2001, * Nara Institute of Science and Technology --- 1,6 ---- /* * cha_jfgets.c - fgets() for japaneses ! * by k-chinen@is.aist-nara.ac.jp, 1996. * * Copyright (C) 1996, 1997, 2000, 2001, * Nara Institute of Science and Technology *************** *** 36,46 **** * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * * NOTE: An idea of these procedures are taken from youhcan's jutils.c ! * for wais-japanese ! * ! * for Korean : yosita-h HIRAHIRA 97/03/02 06:33:24 ! ! * $Id: jfgets.c,v 1.6 2001/02/23 12:51:34 kazuma-t Exp $ */ --- 36,44 ---- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * * NOTE: An idea of these procedures are taken from youhcan's jutils.c ! * for wais-japanese ! * ! * $Id: jfgets.c,v 1.8 2001/03/08 13:59:23 kazuma-t Exp $ */ *************** *** 49,55 **** #define INNER_BUFSIZE 8192 ! /* delimiter for cha_jfgets() */ static char jfgets_delimiter[256] = ".。!?"; void --- 47,55 ---- #define INNER_BUFSIZE 8192 ! /* ! * delimiter for cha_jfgets() ! */ static char jfgets_delimiter[256] = ".。!?"; void *************** *** 58,165 **** strncpy(jfgets_delimiter, delimiter, sizeof(jfgets_delimiter)); } - /* - * pretty printing for debug - */ - int cha_jistoeuc(unsigned char *ibuffer, unsigned char *obuffer) { ! unsigned char *p, *o; ! int level, flag; ! extern int Cha_lang_e; ! ! level = 0; ! flag = 0; ! o = obuffer; ! ! for(p=ibuffer; *p; p++) { ! if(*p==0x1b) { ! level = 1; ! } ! else if(level==1) { ! if(*p=='$') level = 2; /* ESC $ */ ! else if(*p=='(') level = 12; /* ESC ( */ ! else level = 0; ! } ! else if(level) { ! /* Translation */ ! if(level== 2 && *p=='@') flag = 1; /* ESC $ @ */ ! if(level== 2 && *p=='B') flag = 1; /* ESC $ B */ ! if(level==12 && *p=='B') flag = 0; /* ESC ( B */ ! if(level==12 && *p=='J') flag = 0; /* ESC ( J */ ! /* Give up to parse escape sequence */ ! level=0; ! } ! else if(flag && *p>=0x20) { ! /* KANJI mode without control characters */ ! *o++ = *p++ | 0x80; ! *o++ = *p | 0x80; ! } ! /* ASCII mode or control character in KANJI mode */ ! #if 1 ! /* plural space characters -> single space */ ! else if (Cha_lang_e && (*p == ' ' || *p == '\t')) { ! if (o == obuffer || o[-1] != ' ') ! *o++ = ' '; ! } ! #endif ! else { ! *o++ = *p; } ! } ! *o = '\0'; ! return 0; } /* * isterminator - check it is terminator or not * * return ! * 1: terminator ! * 0: not terminator ! * -1: error */ ! static ! int isterminator(unsigned char *target, unsigned char *termlist) { ! if(termlist==NULL || target==NULL) { return -1; } while (*termlist) { if (*termlist & 0x80) { ! if (*termlist == *target && *(termlist+1) == *(target+1)) ! return 1; termlist += 2; } else { if (*termlist == *target) ! return 1; termlist++; } } return 0; } - - - /* * inner buffer and inner position. * if stream is empty. 'pos' point NULL. * */ ! ! static int iskanji1(unsigned char *str, int idx) { int n; ! for (n = 0; idx >= 0 && str[idx] >= 0x80; n++, idx--) ! ; return n & 1; } --- 58,166 ---- strncpy(jfgets_delimiter, delimiter, sizeof(jfgets_delimiter)); } int cha_jistoeuc(unsigned char *ibuffer, unsigned char *obuffer) { ! unsigned char *p, *o; ! int level, flag; ! level = 0; ! flag = 0; ! o = obuffer; ! ! for (p = ibuffer; *p; p++) { ! if (*p == 0x1b) { ! level = 1; ! } else if (level == 1) { ! if (*p == '$') ! level = 2; /* ESC $ */ ! else if (*p == '(') ! level = 12; /* ESC ( */ ! else ! level = 0; ! } else if (level) { ! /* ! * Translation ! */ ! if (level == 2 && *p == '@') ! flag = 1; /* ESC $ @ */ ! if (level == 2 && *p == 'B') ! flag = 1; /* ESC $ B */ ! if (level == 12 && *p == 'B') ! flag = 0; /* ESC ( B */ ! if (level == 12 && *p == 'J') ! flag = 0; /* ESC ( J */ ! ! /* ! * Give up to parse escape sequence ! */ ! level = 0; ! } else if (flag && *p >= 0x20) { ! /* ! * KANJI mode without control characters ! */ ! *o++ = *p++ | 0x80; ! *o++ = *p | 0x80; ! } ! /* ! * ASCII mode or control character in KANJI mode ! */ ! /* ! * plural space characters -> single space ! */ ! else if (*p == ' ' || *p == '\t') { ! if (o == obuffer || o[-1] != ' ') ! *o++ = ' '; ! } else { ! *o++ = *p; ! } } ! *o = '\0'; ! return 0; } /* * isterminator - check it is terminator or not * * return ! * 1: terminator ! * 0: not terminator ! * -1: error */ ! static int isterminator(unsigned char *target, unsigned char *termlist) { ! if (termlist == NULL || target == NULL) { return -1; } while (*termlist) { if (*termlist & 0x80) { ! if (*termlist == *target && *(termlist + 1) == *(target + 1)) ! return 1; termlist += 2; } else { if (*termlist == *target) ! return 1; termlist++; } } return 0; } /* * inner buffer and inner position. * if stream is empty. 'pos' point NULL. * */ ! static int ! iskanji1(unsigned char *str, int idx) { int n; ! for (n = 0; idx >= 0 && str[idx] >= 0x80; n++, idx--); return n & 1; } *************** *** 167,302 **** /* * cha_fget_line - get line via fgets(). So it is really reading function :-) */ ! char *cha_fget_line(char *buffer, int bufsize, FILE *stream) { ! /* extern variable, ugly... */ ! extern int Cha_server_mode; ! ! static unsigned char tmp_buf[INNER_BUFSIZE], *tmp, kanji1; ! int last; ! tmp = tmp_buf; ! if (kanji1) { ! *tmp++ = kanji1; ! kanji1 = 0; ! } ! ! if(fgets(tmp, bufsize, stream) == NULL) ! return NULL; ! ! /* remove the last extra character */ ! last = strlen(tmp_buf) - 1; ! if (iskanji1(tmp_buf, last)) { ! kanji1 = tmp_buf[last]; ! tmp_buf[last] = 0; ! } ! ! /* for server mode */ ! tmp = tmp_buf; ! if (Cha_server_mode) ! if (tmp_buf[0] == '.' && tmp_buf[1] == '.') ! tmp++; ! ! /* ! * call convertor ! * NOTE: EUC string is short than JIS string. ! * if you want to other conversion, you must care about string length. ! */ ! cha_jistoeuc(tmp, buffer); ! return buffer; ! } /* * cha_jfgets - fgets() for Japanese Text. * */ - char * ! cha_jfgets(char *buffer, int bufsize, FILE *stream) { ! static unsigned char ibuf[INNER_BUFSIZE]; ! static unsigned char *pos=(unsigned char *)""; /* set to the end of line */ ! unsigned char *q; ! int count; ! int kflag; /* kanji flag(0=not found, 1=found) */ ! ! if(pos == NULL) ! return NULL; ! ! kflag = 0; ! q = (unsigned char *)buffer; ! bufsize--; ! ! for (count = bufsize; count > 0; count--) { ! /* line is end without '\n', long string read more */ ! if(*pos == '\0') ! if((pos = cha_fget_line(ibuf, sizeof(ibuf), stream)) == NULL) ! break; ! ! /* KANJI */ ! if(*pos >= 0x80 && *(pos+1)) { ! if (count<2) ! break; ! kflag = 1; ! count--; ! *q++ = *pos++; ! *q++ = *pos++; ! ! /* hit delimiter */ ! if(isterminator(pos - 2, jfgets_delimiter)) { ! if (*pos == '\n') ! pos++; ! break; ! } ! } ! /* not KANJI */ ! else { ! /* line is end */ ! if(*pos == '\n') { ! /* eliminate space characters at the end of line */ ! while(q > (unsigned char *)buffer && (q[-1]==' ' || q[-1]=='\t')) ! q--; ! ! if((pos = cha_fget_line(ibuf, sizeof(ibuf), stream)) == NULL) ! break; ! ! while (*pos == ' ' || *pos == '\t') ! pos++; ! ! /* not have kanji or no space, return with this line */ ! if(count <= 0) ! break; ! ! /* have kanji, connect next line */ ! /* double '\n' is paragraph end. so it is delimiter */ ! if(*pos=='\n') ! break; ! ! /* "ASCII\nASCII" -> "ASCII ASCII" */ ! if (!kflag && !(*pos & 0x80)) ! *q++ = ' '; ! } ! else { ! if (*pos != ' ' && *pos != '\t') ! kflag = 0; ! *q++ = *pos++; ! ! /* hit delimiter */ ! if(isterminator(pos - 1, jfgets_delimiter)) { ! if (*pos == '\n') ! pos++; ! break; ! } ! } ! } ! } ! *q = '\0'; ! return buffer; } --- 168,332 ---- /* * cha_fget_line - get line via fgets(). So it is really reading function :-) */ ! char * ! cha_fget_line(char *buffer, int bufsize, FILE * stream) { ! /* ! * extern variable, ugly... ! */ ! extern int Cha_server_mode; ! ! static unsigned char tmp_buf[INNER_BUFSIZE], *tmp, kanji1; ! int last; ! ! tmp = tmp_buf; ! if (kanji1) { ! *tmp++ = kanji1; ! kanji1 = 0; ! } ! if (fgets(tmp, bufsize, stream) == NULL) ! return NULL; ! /* ! * remove the last extra character ! */ ! last = strlen(tmp_buf) - 1; ! if (iskanji1(tmp_buf, last)) { ! kanji1 = tmp_buf[last]; ! tmp_buf[last] = 0; ! } ! /* ! * for server mode ! */ ! tmp = tmp_buf; ! if (Cha_server_mode) ! if (tmp_buf[0] == '.' && tmp_buf[1] == '.') ! tmp++; ! ! /* ! * call convertor ! * NOTE: EUC string is short than JIS string. ! * if you want to other conversion, ! * you must care about string length. ! */ + cha_jistoeuc(tmp, buffer); + return buffer; + } /* * cha_jfgets - fgets() for Japanese Text. * */ char * ! cha_jfgets(char *buffer, int bufsize, FILE * stream) { ! static unsigned char ibuf[INNER_BUFSIZE]; ! /* set to the end of line */ ! static unsigned char *pos = (unsigned char *) ""; ! unsigned char *q; ! int count; ! int kflag; /* kanji flag(0=not found, 1=found) */ ! ! if (pos == NULL) ! return NULL; ! ! kflag = 0; ! q = (unsigned char *) buffer; ! bufsize--; ! ! for (count = bufsize; count > 0; count--) { ! /* ! * line is end without '\n', long string read more ! */ ! if (*pos == '\0') ! if ((pos = cha_fget_line(ibuf, sizeof(ibuf), stream)) == NULL) ! break; ! ! /* ! * KANJI ! */ ! if (*pos >= 0x80 && *(pos + 1)) { ! if (count < 2) ! break; ! kflag = 1; ! count--; ! *q++ = *pos++; ! *q++ = *pos++; ! ! /* ! * hit delimiter ! */ ! if (isterminator(pos - 2, jfgets_delimiter)) { ! if (*pos == '\n') ! pos++; ! break; ! } ! } ! /* ! * not KANJI ! */ ! else { ! /* ! * line is end ! */ ! if (*pos == '\n') { ! /* ! * eliminate space characters at the end of line ! */ ! while (q > (unsigned char *) buffer ! && (q[-1] == ' ' || q[-1] == '\t')) ! q--; ! ! if ((pos = ! cha_fget_line(ibuf, sizeof(ibuf), stream)) == NULL) ! break; ! ! while (*pos == ' ' || *pos == '\t') ! pos++; ! ! /* ! * not have kanji or no space, return with this line ! */ ! if (count <= 0) ! break; ! ! /* ! * have kanji, connect next line ! */ ! /* ! * double '\n' is paragraph end. so it is delimiter ! */ ! if (*pos == '\n') ! break; ! ! /* ! * "ASCII\nASCII" -> "ASCII ASCII" ! */ ! if (!kflag && !(*pos & 0x80)) ! *q++ = ' '; ! } else { ! if (*pos != ' ' && *pos != '\t') ! kflag = 0; ! *q++ = *pos++; ! ! /* ! * hit delimiter ! */ ! if (isterminator(pos - 1, jfgets_delimiter)) { ! if (*pos == '\n') ! pos++; ! break; ! } ! } ! } ! } ! *q = '\0'; ! return buffer; } diff -crN chasen-2.2.3/lib/katuyou.c chasen-2.2.4/lib/katuyou.c *** chasen-2.2.3/lib/katuyou.c Fri Feb 23 21:51:34 2001 --- chasen-2.2.4/lib/katuyou.c Tue Mar 13 07:05:26 2001 *************** *** 34,41 **** * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * 1990/12/17/Mon Yutaka MYOKI(Nagao Lab., KUEE) ! * $Id: katuyou.c,v 1.6 2001/02/23 12:51:34 kazuma-t Exp $ */ #include "chadic.h" --- 34,41 ---- * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * ! * 1990/12/17/Mon Yutaka MYOKI(Nagao Lab., KUEE) ! * $Id: katuyou.c,v 1.8 2001/03/12 22:05:26 masayu-a Exp $ */ #include "chadic.h" *************** *** 43,52 **** ktype_t Cha_type[TYPE_NUM]; kform_t Cha_form[TYPE_NUM][FORM_NUM]; ! /*********************************************************************** * read_type_form ! ***********************************************************************/ ! static void read_type_form(FILE *fp) { chasen_cell_t *cell1, *cell2; int i, j; --- 43,53 ---- ktype_t Cha_type[TYPE_NUM]; kform_t Cha_form[TYPE_NUM][FORM_NUM]; ! /* * read_type_form ! */ ! static void ! read_type_form(FILE * fp) { chasen_cell_t *cell1, *cell2; int i, j; *************** *** 57,94 **** Cha_type[i].name = cha_strdup(cha_s_atom(cha_car(cell1))); Cha_type[i].basic = 0; cell1 = cha_car(cha_cdr(cell1)); ! for (j = 1; !nullp(cell2 = cha_car(cell1)); cell1 = cha_cdr(cell1), j++) { ! /* name */ Cha_form[i][j].name = cha_strdup(cha_s_atom(cha_car(cell2))); if (!Cha_type[i].basic && ! strmatch2(Cha_form[i][j].name, JSTR_BASIC_FORM, ESTR_BASIC_FORM)) ! Cha_type[i].basic = j; ! /* gobi */ ! if (strcmp(s = cha_s_atom(cha_car(cell2 = cha_cdr(cell2))), "*") == 0) ! Cha_form[i][j].gobi = ""; else { ! Cha_form[i][j].gobi = cha_strdup(s); ! Cha_form[i][j].gobi_len = strlen(s); #ifdef SJIS ! sjis2euc(Cha_form[i][j].gobi); #endif } ! /* ygobi */ if (nullp(cha_car(cell2 = cha_cdr(cell2)))) ! Cha_form[i][j].ygobi = Cha_form[i][j].gobi; else if (strcmp(s = cha_s_atom(cha_car(cell2)), "*") == 0) ! Cha_form[i][j].ygobi = ""; else { Cha_form[i][j].ygobi = cha_strdup(s); #ifdef SJIS sjis2euc(Cha_form[i][j].ygobi); #endif } ! /* pgobi */ if (nullp(cha_car(cell2 = cha_cdr(cell2)))) ! Cha_form[i][j].pgobi = Cha_form[i][j].ygobi; else if (strcmp(s = cha_s_atom(cha_car(cell2)), "*") == 0) ! Cha_form[i][j].pgobi = ""; else { Cha_form[i][j].pgobi = cha_strdup(s); #ifdef SJIS --- 58,107 ---- Cha_type[i].name = cha_strdup(cha_s_atom(cha_car(cell1))); Cha_type[i].basic = 0; cell1 = cha_car(cha_cdr(cell1)); ! for (j = 1; !nullp(cell2 = cha_car(cell1)); ! cell1 = cha_cdr(cell1), j++) { ! /* ! * name ! */ Cha_form[i][j].name = cha_strdup(cha_s_atom(cha_car(cell2))); if (!Cha_type[i].basic && ! strmatch3(Cha_form[i][j].name, JSTR_BASE_FORM, ! ESTR_BASE_FORM1, ESTR_BASE_FORM2)) ! Cha_type[i].basic = j; ! /* ! * gobi ! */ ! if (strcmp ! (s = ! cha_s_atom(cha_car(cell2 = cha_cdr(cell2))), "*") == 0) ! Cha_form[i][j].gobi = ""; else { ! Cha_form[i][j].gobi = cha_strdup(s); ! Cha_form[i][j].gobi_len = strlen(s); #ifdef SJIS ! sjis2euc(Cha_form[i][j].gobi); #endif } ! /* ! * ygobi ! */ if (nullp(cha_car(cell2 = cha_cdr(cell2)))) ! Cha_form[i][j].ygobi = Cha_form[i][j].gobi; else if (strcmp(s = cha_s_atom(cha_car(cell2)), "*") == 0) ! Cha_form[i][j].ygobi = ""; else { Cha_form[i][j].ygobi = cha_strdup(s); #ifdef SJIS sjis2euc(Cha_form[i][j].ygobi); #endif } ! /* ! * pgobi ! */ if (nullp(cha_car(cell2 = cha_cdr(cell2)))) ! Cha_form[i][j].pgobi = Cha_form[i][j].ygobi; else if (strcmp(s = cha_s_atom(cha_car(cell2)), "*") == 0) ! Cha_form[i][j].pgobi = ""; else { Cha_form[i][j].pgobi = cha_strdup(s); #ifdef SJIS *************** *** 97,122 **** } } if (!Cha_type[i].basic) ! cha_exit_file(1, "no basic form"); } } ! /*********************************************************************** * cha_read_katuyou - read CFORM_FILE and set Cha_form[][] * * inputs: * dir - 0: read from current directory * 1: read from grammar directory * 2: read from current directory or grammar directory ! ***********************************************************************/ ! void cha_read_katuyou(FILE *fp_out, int dir) { FILE *fp; char *filepath; fp = cha_fopen_grammar(CFORM_FILE, "r", 1, dir, &filepath); if (fp_out != NULL) ! fprintf(fp_out, "parsing %s\n", filepath); read_type_form(fp); --- 110,136 ---- } } if (!Cha_type[i].basic) ! cha_exit_file(1, "no basic form"); } } ! /* * cha_read_katuyou - read CFORM_FILE and set Cha_form[][] * * inputs: * dir - 0: read from current directory * 1: read from grammar directory * 2: read from current directory or grammar directory ! */ ! void ! cha_read_katuyou(FILE * fp_out, int dir) { FILE *fp; char *filepath; fp = cha_fopen_grammar(CFORM_FILE, "r", 1, dir, &filepath); if (fp_out != NULL) ! fprintf(fp_out, "parsing %s\n", filepath); read_type_form(fp); diff -crN chasen-2.2.3/lib/lisp.c chasen-2.2.4/lib/lisp.c *** chasen-2.2.3/lib/lisp.c Fri Feb 23 20:50:10 2001 --- chasen-2.2.4/lib/lisp.c Sat Feb 24 15:17:22 2001 *************** *** 37,46 **** * Author: 1990/11/12/Mon Yutaka MYOKI(Nagao Lab., KUEE) * 1990/12/16/Mon Modified special thanks to Itsuki NODA * A.Kitauchi , Apr. 1997 ! * $Id: lisp.c,v 1.5 2001/02/23 11:50:10 kazuma-t Exp $ */ - #include "chadic.h" #define COMMENTCHAR ';' --- 37,45 ---- * Author: 1990/11/12/Mon Yutaka MYOKI(Nagao Lab., KUEE) * 1990/12/16/Mon Modified special thanks to Itsuki NODA * A.Kitauchi , Apr. 1997 ! * $Id: lisp.c,v 1.6 2001/02/24 06:17:22 kazuma-t Exp $ */ #include "chadic.h" #define COMMENTCHAR ';' *************** *** 53,66 **** #define new_cell() (cha_cons(NIL, NIL)) #define eq(x, y) (x == y) ! static int (*cha_getc)(); ! static int (*cha_ungetc)(); static int is_bol = 1; static int c_stacked = EOF; static int c_skip = 0; ! static int cha_getc_server(FILE *fp) { int c; --- 52,66 ---- #define new_cell() (cha_cons(NIL, NIL)) #define eq(x, y) (x == y) ! static int (*cha_getc) (); ! static int (*cha_ungetc) (); static int is_bol = 1; static int c_stacked = EOF; static int c_skip = 0; ! static int ! cha_getc_server(FILE * fp) { int c; *************** *** 68,112 **** c = c_stacked; c_stacked = EOF; } else ! c = getc(fp); ! /* skip '\r' */ if (c == '\r') ! c = getc(fp); if (c == '.' && is_bol) { ! /* skip '\r' */ if ((c = getc(fp)) == '\r') ! c = getc(fp); if (c == '\n') ! c = EOF; } is_bol = c == '\n' ? 1 : 0; - #if 0 - putc(c,stdout);fflush(stdout); - #endif - return c; } ! static int cha_ungetc_server(int c, FILE *fp) { c_stacked = c; return c; } ! void cha_set_getc_alone(void) { ! extern int fgetc(FILE*); ! extern int ungetc(int, FILE*); cha_getc = fgetc; cha_ungetc = ungetc; } ! void cha_set_getc_server(void) { cha_getc = cha_getc_server; cha_ungetc = cha_ungetc_server; --- 68,119 ---- c = c_stacked; c_stacked = EOF; } else ! c = getc(fp); ! /* ! * skip '\r' ! */ if (c == '\r') ! c = getc(fp); if (c == '.' && is_bol) { ! /* ! * skip '\r' ! */ if ((c = getc(fp)) == '\r') ! c = getc(fp); if (c == '\n') ! c = EOF; } is_bol = c == '\n' ? 1 : 0; return c; } ! static int ! cha_ungetc_server(int c, FILE * fp) { c_stacked = c; return c; } ! void ! cha_set_getc_alone(void) { ! /* ! * For system having no prototype declarations for the following ! * functions such as SunOS 4.1.4. ! */ ! extern int fgetc(FILE *); ! extern int ungetc(int, FILE *); cha_getc = fgetc; cha_ungetc = ungetc; } ! void ! cha_set_getc_server(void) { cha_getc = cha_getc_server; cha_ungetc = cha_ungetc_server; *************** *** 116,128 **** /* * error_in_lisp */ ! static chasen_cell_t *error_in_lisp(void) { cha_exit_file(1, "premature end of file or string\n"); return NIL; } ! void cha_set_skip_char(int c) { c_skip = c; } --- 123,137 ---- /* * error_in_lisp */ ! static chasen_cell_t * ! error_in_lisp(void) { cha_exit_file(1, "premature end of file or string\n"); return NIL; } ! void ! cha_set_skip_char(int c) { c_skip = c; } *************** *** 131,149 **** * ifnextchar - if next char is return 1, otherwise return 0 */ #define ifnextchar(fp, ch) ifnextchar2(fp, ch, 0) ! static int ifnextchar2(FILE *fp, int ch1, int ch2) { int c; do { c = cha_getc(fp); ! if (c == '\n') Cha_lineno++; } while (c == ' ' || c == '\t' || c == '\n'); if (c == EOF) ! return EOF; if (c == ch1 || (ch2 && c == ch2)) ! return TRUE; cha_ungetc(c, fp); return FALSE; } --- 140,160 ---- * ifnextchar - if next char is return 1, otherwise return 0 */ #define ifnextchar(fp, ch) ifnextchar2(fp, ch, 0) ! static int ! ifnextchar2(FILE * fp, int ch1, int ch2) { int c; do { c = cha_getc(fp); ! if (c == '\n') ! Cha_lineno++; } while (c == ' ' || c == '\t' || c == '\n'); if (c == EOF) ! return EOF; if (c == ch1 || (ch2 && c == ch2)) ! return TRUE; cha_ungetc(c, fp); return FALSE; } *************** *** 151,193 **** /* * skip comment lines */ ! static int skip_comment(FILE *fp) { int n, c; ! while ((n = ifnextchar2(fp, (int)COMMENTCHAR, c_skip)) == TRUE) { while ((c = cha_getc(fp)) != '\n') ! if (c == EOF) ! return c; Cha_lineno++; } return n; } ! int cha_s_feof(FILE *fp) { int c; ! /* init the pointer to output functions */ if (cha_getc == NULL) ! cha_set_getc_alone(); if (Cha_lineno == 0) ! Cha_lineno = 1; Cha_lineno_error = Cha_lineno; for (;;) { if (skip_comment(fp) == EOF) ! return TRUE; if ((c = cha_getc(fp)) == '\n') ! Cha_lineno++; ! else if (c == ' ' || c == '\t') ! ; else { cha_ungetc(c, fp); return FALSE; ! } } } --- 162,207 ---- /* * skip comment lines */ ! static int ! skip_comment(FILE * fp) { int n, c; ! while ((n = ifnextchar2(fp, (int) COMMENTCHAR, c_skip)) == TRUE) { while ((c = cha_getc(fp)) != '\n') ! if (c == EOF) ! return c; Cha_lineno++; } return n; } ! int ! cha_s_feof(FILE * fp) { int c; ! /* ! * init the pointer to output functions ! */ if (cha_getc == NULL) ! cha_set_getc_alone(); if (Cha_lineno == 0) ! Cha_lineno = 1; Cha_lineno_error = Cha_lineno; for (;;) { if (skip_comment(fp) == EOF) ! return TRUE; if ((c = cha_getc(fp)) == '\n') ! Cha_lineno++; ! else if (c == ' ' || c == '\t'); else { cha_ungetc(c, fp); return FALSE; ! } } } *************** *** 195,252 **** * malloc_free_cell() * */ - #define malloc_cell() malloc_free_cell(0) #define free_cell() malloc_free_cell(1) ! static chasen_cell_t *malloc_free_cell(int isfree) { ! static chasen_cell_t *ptr[1024*16]; static int ptr_num = 0; static int idx = CELLALLOCSTEP; if (isfree) { ! /* free */ if (ptr_num > 0) { while (ptr_num > 1) ! free(ptr[--ptr_num]); idx = 0; } return NULL; } else { if (idx == CELLALLOCSTEP) { ! if (ptr_num == 1024*16) ! cha_exit(1, "Can't allocate memory"); ptr[ptr_num++] = cha_malloc(sizeof(chasen_cell_t) * idx); idx = 0; } return ptr[ptr_num - 1] + idx++; } - - #if 0 - return malloc(sizeof(chasen_cell_t)); - #endif } #define CHA_MALLOC_SIZE (1024 * 64) #define free_char() malloc_char(-1) ! static void *malloc_char(int size) { static char *ptr[128]; static int ptr_num = 0; static int idx = CHA_MALLOC_SIZE; if (size < 0) { ! /* free */ if (ptr_num > 0) { while (ptr_num > 1) ! free(ptr[--ptr_num]); idx = 0; } return NULL; } else { if (idx + size >= CHA_MALLOC_SIZE) { if (ptr_num == 128) ! cha_exit(1, "Can't allocate memory"); ptr[ptr_num++] = cha_malloc(CHA_MALLOC_SIZE); idx = 0; } --- 209,267 ---- * malloc_free_cell() * */ #define malloc_cell() malloc_free_cell(0) #define free_cell() malloc_free_cell(1) ! static chasen_cell_t * ! malloc_free_cell(int isfree) { ! static chasen_cell_t *ptr[1024 * 16]; static int ptr_num = 0; static int idx = CELLALLOCSTEP; if (isfree) { ! /* ! * free ! */ if (ptr_num > 0) { while (ptr_num > 1) ! free(ptr[--ptr_num]); idx = 0; } return NULL; } else { if (idx == CELLALLOCSTEP) { ! if (ptr_num == 1024 * 16) ! cha_exit(1, "Can't allocate memory"); ptr[ptr_num++] = cha_malloc(sizeof(chasen_cell_t) * idx); idx = 0; } return ptr[ptr_num - 1] + idx++; } } #define CHA_MALLOC_SIZE (1024 * 64) #define free_char() malloc_char(-1) ! static void * ! malloc_char(int size) { static char *ptr[128]; static int ptr_num = 0; static int idx = CHA_MALLOC_SIZE; if (size < 0) { ! /* ! * free ! */ if (ptr_num > 0) { while (ptr_num > 1) ! free(ptr[--ptr_num]); idx = 0; } return NULL; } else { if (idx + size >= CHA_MALLOC_SIZE) { if (ptr_num == 128) ! cha_exit(1, "Can't allocate memory"); ptr[ptr_num++] = cha_malloc(CHA_MALLOC_SIZE); idx = 0; } *************** *** 255,261 **** } } ! static char *lisp_strdup(char *str) { char *newstr; --- 270,277 ---- } } ! static char * ! lisp_strdup(char *str) { char *newstr; *************** *** 265,293 **** return newstr; } ! void cha_s_free(chasen_cell_t *cell) { - #if 1 free_cell(); free_char(); - #else - if (atomp(cell)) { - free(cell->value.atom); - } else if (consp(cell)) { - cha_s_free(cha_car(cell)); - cha_s_free(cha_cdr(cell)); - } else { - return; - } - - free(cell); - #endif } /* * cha_tmp_atom */ ! chasen_cell_t *cha_tmp_atom(char *atom) { static chasen_cell_t _TmpCell; static chasen_cell_t *TmpCell = &_TmpCell; --- 281,298 ---- return newstr; } ! void ! cha_s_free(chasen_cell_t * cell) { free_cell(); free_char(); } /* * cha_tmp_atom */ ! chasen_cell_t * ! cha_tmp_atom(char *atom) { static chasen_cell_t _TmpCell; static chasen_cell_t *TmpCell = &_TmpCell; *************** *** 301,307 **** /* * cha_cons */ ! chasen_cell_t *cha_cons(void *cha_car, void *cha_cdr) { chasen_cell_t *cell; --- 306,313 ---- /* * cha_cons */ ! chasen_cell_t * ! cha_cons(void *cha_car, void *cha_cdr) { chasen_cell_t *cell; *************** *** 316,330 **** /* * cha_car */ ! chasen_cell_t *cha_car(chasen_cell_t *cell) { if (consp(cell)) ! return car_val(cell); if (nullp(cell)) ! return NIL; ! /* error */ cha_exit_file(1, "%s is not list", cha_s_tostr(cell)); Cha_errno = 1; return NIL; --- 322,339 ---- /* * cha_car */ ! chasen_cell_t * ! cha_car(chasen_cell_t * cell) { if (consp(cell)) ! return car_val(cell); if (nullp(cell)) ! return NIL; ! /* ! * error ! */ cha_exit_file(1, "%s is not list", cha_s_tostr(cell)); Cha_errno = 1; return NIL; *************** *** 333,357 **** /* * cha_cdr */ ! chasen_cell_t *cha_cdr(chasen_cell_t *cell) { if (consp(cell)) ! return cdr_val(cell); if (nullp(cell)) ! return NIL; ! /* error */ cha_exit_file(1, "%s is not list\n", cha_s_tostr(cell)); return NIL; } ! char *cha_s_atom(chasen_cell_t *cell) { if (atomp(cell)) ! return s_atom_val(cell); ! /* error */ cha_exit_file(1, "%s is not atom\n", cha_s_tostr(cell)); return NILSYMBOL; } --- 342,372 ---- /* * cha_cdr */ ! chasen_cell_t * ! cha_cdr(chasen_cell_t * cell) { if (consp(cell)) ! return cdr_val(cell); if (nullp(cell)) ! return NIL; ! /* ! * error ! */ cha_exit_file(1, "%s is not list\n", cha_s_tostr(cell)); return NIL; } ! char * ! cha_s_atom(chasen_cell_t * cell) { if (atomp(cell)) ! return s_atom_val(cell); ! /* ! * error ! */ cha_exit_file(1, "%s is not atom\n", cha_s_tostr(cell)); return NILSYMBOL; } *************** *** 359,397 **** /* * cha_equal */ ! int cha_equal(void *x, void *y) { ! if (eq(x, y)) return TRUE; ! if (nullp(x) || nullp(y)) return FALSE; ! if (s_tag(x) != s_tag(y)) return FALSE; ! if (s_tag(x) == ATOM) return !strcmp(s_atom_val(x), s_atom_val(y)); if (s_tag(x) == CONS) ! return (cha_equal(car_val(x), car_val(y)) && cha_equal(cdr_val(x), cdr_val(y))); return FALSE; } ! int cha_s_length(chasen_cell_t *list) { int i; for (i = 0; consp(list); i++) ! list = cdr_val(list); return i; } ! static int dividing_code_p(int code) { switch (code) { ! case '\n': case '\t': case ';': ! case '(': case ')': case ' ': return 1; ! default: return 0; } } ! static int myscanf(FILE *fp, char *str) { int code; int in_quote = 0; --- 374,425 ---- /* * cha_equal */ ! int ! cha_equal(void *x, void *y) { ! if (eq(x, y)) ! return TRUE; ! if (nullp(x) || nullp(y)) ! return FALSE; ! if (s_tag(x) != s_tag(y)) ! return FALSE; ! if (s_tag(x) == ATOM) ! return !strcmp(s_atom_val(x), s_atom_val(y)); if (s_tag(x) == CONS) ! return (cha_equal(car_val(x), car_val(y)) ! && cha_equal(cdr_val(x), cdr_val(y))); return FALSE; } ! int ! cha_s_length(chasen_cell_t * list) { int i; for (i = 0; consp(list); i++) ! list = cdr_val(list); return i; } ! static int ! dividing_code_p(int code) { switch (code) { ! case '\n': ! case '\t': ! case ';': ! case '(': ! case ')': ! case ' ': return 1; ! default: return 0; } } ! static int ! myscanf(FILE * fp, char *str) { int code; int in_quote = 0; *************** *** 406,418 **** for (;;) { if (in_quote) { if (code == EOF) ! return 0; if (code == in_quote) ! break; } else { if (dividing_code_p(code) || code == EOF) { if (s == str) ! return 0; cha_ungetc(code, fp); break; } --- 434,446 ---- for (;;) { if (in_quote) { if (code == EOF) ! return 0; if (code == in_quote) ! break; } else { if (dividing_code_p(code) || code == EOF) { if (s == str) ! return 0; cha_ungetc(code, fp); break; } *************** *** 421,436 **** if (code != '\\' || in_quote == '\'') { *s++ = code; if (code & 0x80) ! *s++ = cha_getc(fp); } else { ! if ((code = cha_getc(fp)) == EOF) ! return 0; switch (code) { ! case 't': ! *s++ = '\t'; break; ! case 'n': ! *s++ = '\n'; break; ! default: *s++ = code; } } --- 449,466 ---- if (code != '\\' || in_quote == '\'') { *s++ = code; if (code & 0x80) ! *s++ = cha_getc(fp); } else { ! if ((code = cha_getc(fp)) == EOF) ! return 0; switch (code) { ! case 't': ! *s++ = '\t'; ! break; ! case 'n': ! *s++ = '\n'; ! break; ! default: *s++ = code; } } *************** *** 445,464 **** /* * cha_s_read - read S-expression */ ! ! static chasen_cell_t *s_read_atom(FILE *fp) { chasen_cell_t *cell; char buffer[BUFSIZ]; skip_comment(fp); ! /* changed by kurohashi. */ if (myscanf(fp, buffer) == 0) ! return error_in_lisp(); if (!strcmp(buffer, NILSYMBOL)) ! return NIL; cell = new_cell(); s_tag(cell) = ATOM; --- 475,496 ---- /* * cha_s_read - read S-expression */ ! static chasen_cell_t * ! s_read_atom(FILE * fp) { chasen_cell_t *cell; char buffer[BUFSIZ]; skip_comment(fp); ! /* ! * changed by kurohashi. ! */ if (myscanf(fp, buffer) == 0) ! return error_in_lisp(); if (!strcmp(buffer, NILSYMBOL)) ! return NIL; cell = new_cell(); s_tag(cell) = ATOM; *************** *** 467,530 **** return cell; } ! static chasen_cell_t *s_read_cdr(FILE*); ! static chasen_cell_t *s_read_main(FILE*); ! static chasen_cell_t *s_read_car(FILE *fp) { chasen_cell_t *cell; skip_comment(fp); ! switch (ifnextchar(fp, (int)EPARENTHESIS)) { ! case TRUE: return NIL; ! case FALSE: cell = new_cell(); car_val(cell) = s_read_main(fp); cdr_val(cell) = s_read_cdr(fp); return cell; ! default: /* EOF */ return error_in_lisp(); } } ! static chasen_cell_t *s_read_cdr(FILE *fp) { skip_comment(fp); ! switch (ifnextchar(fp, (int)EPARENTHESIS)) { ! case TRUE: return NIL; ! case FALSE: return s_read_car(fp); ! default: /* EOF */ return error_in_lisp(); } } ! static chasen_cell_t *s_read_main(FILE *fp) { ! /* skip_comment(fp); */ ! ! switch (ifnextchar(fp, (int)BPARENTHESIS)) { ! case TRUE: return s_read_car(fp); ! case FALSE: return s_read_atom(fp); ! default: /* EOF */ return error_in_lisp(); } } ! chasen_cell_t *cha_s_read(FILE *fp) { ! /* init the pointer to output functions */ if (cha_getc == NULL) ! cha_set_getc_alone(); if (Cha_lineno == 0) ! Cha_lineno = 1; Cha_lineno_error = Cha_lineno; return s_read_main(fp); --- 499,569 ---- return cell; } ! static chasen_cell_t *s_read_cdr(FILE *); ! static chasen_cell_t *s_read_main(FILE *); ! static chasen_cell_t * ! s_read_car(FILE * fp) { chasen_cell_t *cell; skip_comment(fp); ! switch (ifnextchar(fp, (int) EPARENTHESIS)) { ! case TRUE: return NIL; ! case FALSE: cell = new_cell(); car_val(cell) = s_read_main(fp); cdr_val(cell) = s_read_cdr(fp); return cell; ! default: /* EOF */ return error_in_lisp(); } } ! static chasen_cell_t * ! s_read_cdr(FILE * fp) { skip_comment(fp); ! switch (ifnextchar(fp, (int) EPARENTHESIS)) { ! case TRUE: return NIL; ! case FALSE: return s_read_car(fp); ! default: /* EOF */ return error_in_lisp(); } } ! static chasen_cell_t * ! s_read_main(FILE * fp) { ! /* ! * skip_comment(fp); ! */ ! switch (ifnextchar(fp, (int) BPARENTHESIS)) { ! case TRUE: return s_read_car(fp); ! case FALSE: return s_read_atom(fp); ! default: /* EOF */ return error_in_lisp(); } } ! chasen_cell_t * ! cha_s_read(FILE * fp) { ! /* ! * init the pointer to output functions ! */ if (cha_getc == NULL) ! cha_set_getc_alone(); if (Cha_lineno == 0) ! Cha_lineno = 1; Cha_lineno_error = Cha_lineno; return s_read_main(fp); *************** *** 533,559 **** /* * cha_assoc */ ! ! chasen_cell_t *cha_assoc(chasen_cell_t *item, chasen_cell_t *alist) { while (!nullp(alist) && !cha_equal(item, (cha_car(cha_car(alist))))) ! alist = cha_cdr(alist); return cha_car(alist); } /* * cha_s_print - pretty print S-expression */ - static char cell_buffer_for_print[8192]; ! static char *s_tostr_main(chasen_cell_t*); ! static void s_puts_to_buffer(char *str) { static int idx = 0; int len; ! /* initialization */ if (str == NULL) { idx = 0; return; --- 572,600 ---- /* * cha_assoc */ ! chasen_cell_t * ! cha_assoc(chasen_cell_t * item, chasen_cell_t * alist) { while (!nullp(alist) && !cha_equal(item, (cha_car(cha_car(alist))))) ! alist = cha_cdr(alist); return cha_car(alist); } /* * cha_s_print - pretty print S-expression */ static char cell_buffer_for_print[8192]; ! static char *s_tostr_main(chasen_cell_t *); ! static void ! s_puts_to_buffer(char *str) { static int idx = 0; int len; ! /* ! * initialization ! */ if (str == NULL) { idx = 0; return; *************** *** 561,567 **** len = strlen(str); if (idx + len >= sizeof(cell_buffer_for_print)) { ! /* str is too long */ idx = sizeof(cell_buffer_for_print); } else { strcpy(cell_buffer_for_print + idx, str); --- 602,610 ---- len = strlen(str); if (idx + len >= sizeof(cell_buffer_for_print)) { ! /* ! * str is too long ! */ idx = sizeof(cell_buffer_for_print); } else { strcpy(cell_buffer_for_print + idx, str); *************** *** 569,575 **** } } ! static void s_tostr_cdr(chasen_cell_t *cell) { if (!nullp(cell)) { if (consp(cell)) { --- 612,619 ---- } } ! static void ! s_tostr_cdr(chasen_cell_t * cell) { if (!nullp(cell)) { if (consp(cell)) { *************** *** 583,604 **** } } ! static char *s_tostr_main(chasen_cell_t *cell) { if (nullp(cell)) ! s_puts_to_buffer(NILSYMBOL); else { switch (s_tag(cell)) { ! case CONS: s_puts_to_buffer("("); s_tostr_main(car_val(cell)); s_tostr_cdr(cdr_val(cell)); s_puts_to_buffer(")"); break; ! case ATOM: s_puts_to_buffer(s_atom_val(cell)); break; ! default: s_puts_to_buffer("INVALID_CELL"); } } --- 627,649 ---- } } ! static char * ! s_tostr_main(chasen_cell_t * cell) { if (nullp(cell)) ! s_puts_to_buffer(NILSYMBOL); else { switch (s_tag(cell)) { ! case CONS: s_puts_to_buffer("("); s_tostr_main(car_val(cell)); s_tostr_cdr(cdr_val(cell)); s_puts_to_buffer(")"); break; ! case ATOM: s_puts_to_buffer(s_atom_val(cell)); break; ! default: s_puts_to_buffer("INVALID_CELL"); } } *************** *** 606,620 **** return cell_buffer_for_print; } ! char *cha_s_tostr(chasen_cell_t *cell) { ! /* initialization */ s_puts_to_buffer(NULL); return s_tostr_main(cell); } ! chasen_cell_t *cha_s_print(FILE *fp, chasen_cell_t *cell) { fputs(cha_s_tostr(cell), fp); return cell; --- 651,669 ---- return cell_buffer_for_print; } ! char * ! cha_s_tostr(chasen_cell_t * cell) { ! /* ! * initialization ! */ s_puts_to_buffer(NULL); return s_tostr_main(cell); } ! chasen_cell_t * ! cha_s_print(FILE * fp, chasen_cell_t * cell) { fputs(cha_s_tostr(cell), fp); return cell; diff -crN chasen-2.2.3/lib/mmap.c chasen-2.2.4/lib/mmap.c *** chasen-2.2.3/lib/mmap.c Wed Feb 14 09:20:52 2001 --- chasen-2.2.4/lib/mmap.c Fri Mar 16 15:13:57 2001 *************** *** 35,76 **** * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ! * $Id: mmap.c,v 1.3 2001/02/14 00:20:52 masayu-a Exp $ */ #include "config.h" ! #if defined _WIN32 && ! defined __CYGWIN__ ! #include ! #include ! #else #include ! #endif /* _WIN32 && ! defined __CYGWIN__ */ ! #include #include #ifdef HAVE_MMAP #include #endif - #include "chadic.h" #if ! defined _WIN32 && ! defined __CYGWIN__ #define O_BINARY 0 #endif ! off_t cha_mmap_file(char *filename, void **map) { int fd; struct stat st; off_t size; ! if ((fd = open(filename, O_RDONLY)) < 0) ! cha_exit_perror(filename); if (fstat(fd, &st) < 0) ! cha_exit_perror(filename); size = st.st_size; #ifdef HAVE_MMAP ! if ((*map = mmap((void *)0, size, PROT_READ, MAP_SHARED, fd, 0)) ! == MAP_FAILED ) { ! cha_exit_perror(filename); } #else *map = cha_malloc(size); --- 35,103 ---- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ! * $Id: mmap.c,v 1.7 2001/03/16 06:13:57 kazuma-t Exp $ */ #include "config.h" ! ! #ifdef HAVE_UNISTD_H #include ! #endif ! #ifdef HAVE_FCNTL_H ! #include ! #endif ! #ifdef HAVE_SYS_STAT_H #include + #endif + #ifdef HAVE_SYS_TYPES_H + #include + #endif + #ifdef HAVE_SYS_PARAM_H + #include + #endif + #ifdef HAVE_IO_H + #include + #endif + #ifdef HAVE_PROCESS_H + #include + #endif + #ifdef HAVE_WINSOCK2_H + #include + #endif + + #ifdef __MINGW32__ + #undef HAVE_MMAP + #endif #ifdef HAVE_MMAP #include #endif #if ! defined _WIN32 && ! defined __CYGWIN__ #define O_BINARY 0 #endif ! #include "chadic.h" ! ! static off_t ! mmap_file(char *filename, void **map, int prot) { int fd; + int flag = O_RDONLY; struct stat st; off_t size; ! if ((prot & PROT_WRITE) != 0) ! flag = O_RDWR; ! ! if ((fd = open(filename, flag)) < 0) ! cha_exit_perror(filename); if (fstat(fd, &st) < 0) ! cha_exit_perror(filename); size = st.st_size; #ifdef HAVE_MMAP ! if ((*map = mmap((void *) 0, size, prot, MAP_SHARED, fd, 0)) ! == MAP_FAILED) { ! cha_exit_perror(filename); } #else *map = cha_malloc(size); *************** *** 82,88 **** return size; } ! void cha_munmap_file(void *map, off_t size) { #ifdef HAVE_MMAP munmap(map, size); --- 109,128 ---- return size; } ! off_t ! cha_mmap_file(char *filename, void **map) ! { ! return mmap_file(filename, map, PROT_READ); ! } ! ! off_t ! cha_mmap_file_w(char *filename, void **map) ! { ! return mmap_file(filename, map, PROT_READ | PROT_WRITE); ! } ! ! void ! cha_munmap_file(void *map, off_t size) { #ifdef HAVE_MMAP munmap(map, size); diff -crN chasen-2.2.3/lib/parse.c chasen-2.2.4/lib/parse.c *** chasen-2.2.3/lib/parse.c Fri Feb 23 21:51:34 2001 --- chasen-2.2.4/lib/parse.c Fri Mar 16 06:25:48 2001 *************** *** 36,47 **** * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Modified by: A.Kitauchi , Oct. 1996 ! * $Id: parse.c,v 1.19 2001/02/23 12:51:34 kazuma-t Exp $ */ #include "chalib.h" #include "pat.h" #include "sufary.h" #define MRPH_NUM 1024 #define PATH1_NUM 256 --- 36,48 ---- * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Modified by: A.Kitauchi , Oct. 1996 ! * $Id: parse.c,v 1.24 2001/03/15 21:25:48 masayu-a Exp $ */ #include "chalib.h" #include "pat.h" #include "sufary.h" + #include "tokenizer.h" #define MRPH_NUM 1024 #define PATH1_NUM 256 *************** *** 65,102 **** int Cha_path_num; ! /*********************************************************************** * malloc_chars ! ***********************************************************************/ #define CHA_MALLOC_SIZE (1024 * 64) #define malloc_char(n) malloc_chars(1, n) #define malloc_short(n) malloc_chars(2, n) #define malloc_int(n) malloc_chars(4, n) #define free_chars() malloc_chars(0, 0) ! static void *malloc_chars(int size, int nitems) { static char *buffer_ptr[128]; static int buffer_ptr_num = 0; static int buffer_idx = CHA_MALLOC_SIZE; if (nitems == 0) { ! /* free */ if (buffer_ptr_num > 0) { while (buffer_ptr_num > 1) ! free(buffer_ptr[--buffer_ptr_num]); buffer_idx = 0; } return NULL; } else { if (size > 1) { ! /* size で割りきれる値に補正する */ ! buffer_idx+= size - (buffer_idx & (size - 1)); nitems *= size; } if (buffer_idx + nitems >= CHA_MALLOC_SIZE) { if (buffer_ptr_num == 128) ! cha_exit(1, "Can't allocate memory"); buffer_ptr[buffer_ptr_num++] = cha_malloc(CHA_MALLOC_SIZE); buffer_idx = 0; } --- 66,108 ---- int Cha_path_num; ! /* * malloc_chars ! */ #define CHA_MALLOC_SIZE (1024 * 64) #define malloc_char(n) malloc_chars(1, n) #define malloc_short(n) malloc_chars(2, n) #define malloc_int(n) malloc_chars(4, n) #define free_chars() malloc_chars(0, 0) ! static void * ! malloc_chars(int size, int nitems) { static char *buffer_ptr[128]; static int buffer_ptr_num = 0; static int buffer_idx = CHA_MALLOC_SIZE; if (nitems == 0) { ! /* ! * free ! */ if (buffer_ptr_num > 0) { while (buffer_ptr_num > 1) ! free(buffer_ptr[--buffer_ptr_num]); buffer_idx = 0; } return NULL; } else { if (size > 1) { ! /* ! * size で割りきれる値に補正する ! */ ! buffer_idx += size - (buffer_idx & (size - 1)); nitems *= size; } if (buffer_idx + nitems >= CHA_MALLOC_SIZE) { if (buffer_ptr_num == 128) ! cha_exit(1, "Can't allocate memory"); buffer_ptr[buffer_ptr_num++] = cha_malloc(CHA_MALLOC_SIZE); buffer_idx = 0; } *************** *** 106,150 **** } } ! static void *malloc_free_block(void *ptr, int *nblockp, int size, int do_free) { if (do_free) { ! /* free and malloc one block */ if (*nblockp > 1) { - #if 0 - printf("# free block (%d)\n",size); fflush(stdout); - #endif free(ptr); *nblockp = 0; } if (*nblockp == 0) ! ptr = malloc_free_block(ptr, nblockp, size, 0); } else { ! /* realloc one block larger */ if (*nblockp == 0) ! ptr = malloc(size * ++*nblockp); else { - #if 1 ptr = realloc(ptr, size * ++*nblockp); - #else - { - char *ptr2; - ptr2 = cha_malloc(size * (*nblockp + 1)); - memcpy(ptr2, ptr, size * *nblockp); - (*nblockp)++; - free(ptr); - ptr = ptr2; - } - #endif - #if 0 - printf("# %s block (%d*%d)\n",*nblockp?"realloc":"malloc",size,*nblockp); - fflush(stdout); - #endif } - #if 0 - if (ptr == NULL) - printf("# Can't allocate memory"); fflush(stdout); - #endif } return ptr; --- 112,139 ---- } } ! static void * ! malloc_free_block(void *ptr, int *nblockp, int size, int do_free) { if (do_free) { ! /* ! * free and malloc one block ! */ if (*nblockp > 1) { free(ptr); *nblockp = 0; } if (*nblockp == 0) ! ptr = malloc_free_block(ptr, nblockp, size, 0); } else { ! /* ! * realloc one block larger ! */ if (*nblockp == 0) ! ptr = malloc(size * ++*nblockp); else { ptr = realloc(ptr, size * ++*nblockp); } } return ptr; *************** *** 152,308 **** #define malloc_path() malloc_free_path(0) #define free_path() malloc_free_path(1) ! static int malloc_free_path(int do_free) { static int nblock = 0; ! #if 0 ! printf("# path %d:%d ", nblock, Cha_path_num); ! #endif ! ! Cha_path = malloc_free_block((void *)Cha_path, &nblock, ! sizeof(path_t) * CHA_PATH_NUM, do_free); return Cha_path == NULL; } #define malloc_mrph() malloc_free_mrph(0) #define free_mrph() malloc_free_mrph(1) ! static int malloc_free_mrph(int do_free) { static int nblock = 0; ! #if 0 ! printf("# mrph %d ", nblock); ! #endif ! Cha_mrph = malloc_free_block((void *)Cha_mrph, &nblock, ! sizeof(mrph2_t) * MRPH_NUM, do_free); return Cha_mrph == NULL; } ! #if 0 ! /*********************************************************************** ! * check_code() ! ***********************************************************************/ ! static int check_code(char *str) ! { ! int code; ! unsigned char *s = (unsigned char *)str; ! ! /* nyuuryoku chuuni hankaku space wo yurusu, by. T.U. '96.01.10 */ ! #if 1 ! if (*s == '\0' || *s == ' ' || *s == '\r' || *s == '\n') ! return 0; ! #else ! if (*s == '\0') ! return 0; ! #endif ! else if (*s < HANKAKU) ! return HANKAKU; ! else if (*(s+1) < HANKAKU) ! return ILLEGAL; ! ! code = *s * 256 + *(s + 1); ! ! if (code == PRIOD) return PRIOD; ! else if (code == CHOON) return CHOON; ! else if (code < KIGOU) return KIGOU; ! #if 0 ! else if (code < SUJI) return SUJI; ! #endif ! else if (code < ALPH) return ALPH; ! else if (code < HIRAGANA) return HIRAGANA; ! else if (code < KATAKANA) return KATAKANA; ! else if (code < GR) return GR; ! else return KANJI; ! } ! #endif ! ! #if 0 ! /*********************************************************************** ! * undef_mrph_len - 未定義語の長さを調べる ! * ! * ひらがな・漢字・ローマ字など: 1文字 ! * カタカナ: 連続した文字列 ! ************************************************************************/ ! static int undef_mrph_len(char *target) ! { ! int code, next_code; ! int len = 0; ! ! code = check_code(target); ! ! if (code == HIRAGANA || code == KANJI) ! return 2; ! ! do { ! if (code == HANKAKU || code == ILLEGAL) ! len++; ! else ! len += 2; ! next_code = check_code(target + len); ! } while (next_code == code ! || (code == KATAKANA && next_code == CHOON) ! || (code == ALPH && next_code == PRIOD)); ! ! return len; ! } ! #endif ! ! /*********************************************************************** * register_undef_mrph1 - 未定義語をバッファに追加 ! ***********************************************************************/ ! static int register_undef_mrph1(char *target, int mrph_idx, int undef_len, int no) { - #if 0 - int undef_len; - #endif mrph2_t *mrph = &Cha_mrph[mrph_idx]; - #if 0 - undef_len = undef_mrph_len(target); - #endif - - #if 0 - mrph->midasi = (char *)malloc_char(undef_len + 1); - memcpy(mrph->midasi, target, undef_len); - mrph->midasi[undef_len] = '\0'; - #else mrph->midasi = target; - #endif mrph->yomi = ""; mrph->base_length = mrph->length = undef_len; mrph->base = ""; mrph->pron = ""; ! mrph->comp = "\n"; mrph->hinsi = Cha_undef_info[no].hinsi; mrph->con_tbl = Cha_undef_info[no].con_tbl; mrph->ktype = 0; mrph->kform = 0; ! mrph->is_undef = no + 1; /* 未定義語 */ mrph->weight = MRPH_DEFAULT_WEIGHT; ! mrph->info = ""; /* 付加情報は空文字列とする. */ if (++mrph_idx % MRPH_NUM == 0 && malloc_mrph()) - return FALSE; - - return TRUE; - } - - #if 0 - static int register_undef_mrph(char *target, int mrph_idx, int undef_len) - { - int no; - - for (no = 0; no < Cha_undef_info_num; no++) - if (register_undef_mrph1(target, mrph_idx+no, undef_len, no) == FALSE) return FALSE; return TRUE; } - #endif /* * register_mrph - 活用を調べながら形態素をバッファに追加 --- 141,198 ---- #define malloc_path() malloc_free_path(0) #define free_path() malloc_free_path(1) ! static int ! malloc_free_path(int do_free) { static int nblock = 0; ! Cha_path = malloc_free_block((void *) Cha_path, &nblock, ! sizeof(path_t) * CHA_PATH_NUM, do_free); return Cha_path == NULL; } #define malloc_mrph() malloc_free_mrph(0) #define free_mrph() malloc_free_mrph(1) ! static int ! malloc_free_mrph(int do_free) { static int nblock = 0; ! Cha_mrph = malloc_free_block((void *) Cha_mrph, &nblock, ! sizeof(mrph2_t) * MRPH_NUM, do_free); return Cha_mrph == NULL; } ! /* * register_undef_mrph1 - 未定義語をバッファに追加 ! */ ! static int ! register_undef_mrph1(char *target, int mrph_idx, int undef_len, int no) { mrph2_t *mrph = &Cha_mrph[mrph_idx]; mrph->midasi = target; mrph->yomi = ""; mrph->base_length = mrph->length = undef_len; mrph->base = ""; mrph->pron = ""; ! mrph->compound = "\n"; mrph->hinsi = Cha_undef_info[no].hinsi; mrph->con_tbl = Cha_undef_info[no].con_tbl; mrph->ktype = 0; mrph->kform = 0; ! mrph->is_undef = no + 1; /* 未定義語 */ mrph->weight = MRPH_DEFAULT_WEIGHT; ! mrph->info = ""; /* 付加情報は空文字列とする. */ if (++mrph_idx % MRPH_NUM == 0 && malloc_mrph()) return FALSE; return TRUE; } /* * register_mrph - 活用を調べながら形態素をバッファに追加 *************** *** 311,336 **** * If successful, this rutine returns the number of morphs * added to the buffer. If an error occurs, return -1. */ ! static int register_mrph(int mrph_idx) { int new_mrph_idx = mrph_idx; mrph2_t *new_mrph = &Cha_mrph[mrph_idx]; if (!new_mrph->ktype) { ! /* 活用しない */ if (++new_mrph_idx % MRPH_NUM == 0 && malloc_mrph()) ! return -1; } else { ! /* 活用する */ if (new_mrph->kform) { ! /* 語幹なし */ new_mrph->base_length = 0; new_mrph->yomi = ""; new_mrph->pron = ""; if (++new_mrph_idx % MRPH_NUM == 0 && malloc_mrph()) ! return -1; } else { ! /* 語幹あり */ int f; int ktype = new_mrph->ktype; int baselen = new_mrph->length; --- 201,235 ---- * If successful, this rutine returns the number of morphs * added to the buffer. If an error occurs, return -1. */ ! static int ! register_mrph(int mrph_idx) { int new_mrph_idx = mrph_idx; mrph2_t *new_mrph = &Cha_mrph[mrph_idx]; if (!new_mrph->ktype) { ! /* ! * 活用しない ! */ if (++new_mrph_idx % MRPH_NUM == 0 && malloc_mrph()) ! return -1; } else { ! /* ! * 活用する ! */ if (new_mrph->kform) { ! /* ! * 語幹なし ! */ new_mrph->base_length = 0; new_mrph->yomi = ""; new_mrph->pron = ""; if (++new_mrph_idx % MRPH_NUM == 0 && malloc_mrph()) ! return -1; } else { ! /* ! * 語幹あり ! */ int f; int ktype = new_mrph->ktype; int baselen = new_mrph->length; *************** *** 340,353 **** for (f = 1; Cha_form[ktype][f].name; f++) { if (!Cha_form[ktype][f].gobi[0] || (follows[0] == Cha_form[ktype][f].gobi[0] && ! !memcmp(follows, Cha_form[ktype][f].gobi, Cha_form[ktype][f].gobi_len))) { if (new_mrph_idx != new_mrph_idx0) ! *new_mrph = Cha_mrph[new_mrph_idx0]; new_mrph->kform = f; ! new_mrph->length = baselen + Cha_form[ktype][f].gobi_len; new_mrph->con_tbl = con_tbl + f - 1; if (++new_mrph_idx % MRPH_NUM == 0 && malloc_mrph()) ! return -1; new_mrph = &Cha_mrph[new_mrph_idx]; } } --- 239,254 ---- for (f = 1; Cha_form[ktype][f].name; f++) { if (!Cha_form[ktype][f].gobi[0] || (follows[0] == Cha_form[ktype][f].gobi[0] && ! !memcmp(follows, Cha_form[ktype][f].gobi, ! Cha_form[ktype][f].gobi_len))) { if (new_mrph_idx != new_mrph_idx0) ! *new_mrph = Cha_mrph[new_mrph_idx0]; new_mrph->kform = f; ! new_mrph->length = ! baselen + Cha_form[ktype][f].gobi_len; new_mrph->con_tbl = con_tbl + f - 1; if (++new_mrph_idx % MRPH_NUM == 0 && malloc_mrph()) ! return -1; new_mrph = &Cha_mrph[new_mrph_idx]; } } *************** *** 364,390 **** * If successful, this rutine returns the number of morphs * added to the buffer. If an error occurs, return -1. */ ! static int convert_mrphs(char *target, char **dic_buffer, int mrph_idx) { int nmrph; int new_mrph_idx = mrph_idx; char **pbuf; for (pbuf = dic_buffer; *pbuf; pbuf++) { - #if 0 - fprintf(stdout, "line: %s\n", *pbuf); fflush(stdout); - #endif cha_get_mrph_data(&Cha_mrph[new_mrph_idx], *pbuf, target); - #if 0 - printf("%s %d %d %d\n", - Cha_mrph[new_mrph_idx].yomi, - Cha_mrph[new_mrph_idx].hinsi, - Cha_mrph[new_mrph_idx].ktype, - Cha_mrph[new_mrph_idx].kform); - #endif nmrph = register_mrph(new_mrph_idx); if (nmrph < 0) ! return -1; new_mrph_idx += nmrph; } --- 265,282 ---- * If successful, this rutine returns the number of morphs * added to the buffer. If an error occurs, return -1. */ ! static int ! convert_mrphs(char *target, char **dic_buffer, int mrph_idx) { int nmrph; int new_mrph_idx = mrph_idx; char **pbuf; for (pbuf = dic_buffer; *pbuf; pbuf++) { cha_get_mrph_data(&Cha_mrph[new_mrph_idx], *pbuf, target); nmrph = register_mrph(new_mrph_idx); if (nmrph < 0) ! return -1; new_mrph_idx += nmrph; } *************** *** 394,416 **** /* * collect_mrphs_for_pos() */ ! static int collect_mrphs_for_pos(int pos, int *p_idx) { static int p_start; int i, j; j = 0; if (pos == 0) { ! /* new sentence */ p_idx[j++] = 0; p_start = 1; } else { for (i = p_start; i < Cha_path_num; i++) { if (Cha_path[i].end <= pos) { if (i == p_start) ! p_start++; if (Cha_path[i].end == pos) ! p_idx[j++] = i; } } } --- 286,311 ---- /* * collect_mrphs_for_pos() */ ! static int ! collect_mrphs_for_pos(int pos, int *p_idx) { static int p_start; int i, j; j = 0; if (pos == 0) { ! /* ! * new sentence ! */ p_idx[j++] = 0; p_start = 1; } else { for (i = p_start; i < Cha_path_num; i++) { if (Cha_path[i].end <= pos) { if (i == p_start) ! p_start++; if (Cha_path[i].end == pos) ! p_idx[j++] = i; } } } *************** *** 419,448 **** return j; } ! /*********************************************************************** * check_connect() ! ***********************************************************************/ ! static int check_connect(int pos, int m_num, int *p_idx) { ! /* 次状態の値でパスを分類する */ typedef struct _path_cost_t { int min_cost; short min_cost_no; short state; short num; ! int cost[PATH1_NUM]; ! int pno[PATH1_NUM]; } path_cost_t; ! /* ! static short best_start, best_end, best_state; ! static int best_cost; ! */ static path_cost_t pcost[PATH1_NUM]; int pcost_num; mrph2_t *new_mrph; int i, pno, pcostno; ! int haba_cost, con_cost, cost, mrph_cost; int con_tbl, next_state; #ifdef DEBUG --- 314,346 ---- return j; } ! /* * check_connect() ! */ ! static int ! check_connect(int pos, int m_num, int *p_idx) { ! /* ! * 次状態の値でパスを分類する ! */ typedef struct _path_cost_t { int min_cost; short min_cost_no; short state; short num; ! int cost[PATH1_NUM]; ! int pno[PATH1_NUM]; } path_cost_t; ! /* ! * static short best_start, best_end, best_state; static int ! * best_cost; ! */ static path_cost_t pcost[PATH1_NUM]; int pcost_num; mrph2_t *new_mrph; int i, pno, pcostno; ! int haba_cost, con_cost, cost, mrph_cost; int con_tbl, next_state; #ifdef DEBUG *************** *** 455,495 **** pcost_num = 0; for (i = 0; (pno = p_idx[i]) >= 0; i++) { ! #if 0 ! if (i>0&&pcost[0].min_costlength,Cha_path[pno].state,next_state,cost,new_mrph->is_undef); #endif ! /* cost を計算 */ cost = Cha_path[pno].cost + con_cost * Cha_con_cost_weight; ! /* どの pcost に属するか調べる */ for (pcostno = 0; pcostno < pcost_num; pcostno++) ! if (next_state == pcost[pcostno].state) ! break; if (pcostno < pcost_num) { ! /* tricky: when Cha_cost_width is -1, ">-1" means ">=0" */ if (cost - pcost[pcostno].min_cost > Cha_cost_width) ! continue; } else { ! /* 新しい pcost を作る */ pcost_num++; pcost[pcostno].num = 0; pcost[pcostno].state = next_state; pcost[pcostno].min_cost = INT_MAX; } ! /* pcost に登録 */ if (Cha_cost_width < 0) { pcost[pcostno].min_cost = cost; pcost[pcostno].pno[0] = pno; --- 353,403 ---- pcost_num = 0; for (i = 0; (pno = p_idx[i]) >= 0; i++) { ! /* ! * オートマトンを調べて次状態と接続コストを出す ! */ next_state = cha_check_automaton ! (Cha_path[pno].state, con_tbl, Cha_con_cost_undef, &con_cost); ! if (con_cost == -1) ! continue; #ifdef DEBUG ! printf ! ("[%3d, %3d, pos:%d, len:%d, state:%5d,%5d, cost:%d, undef:%d]\n", ! Cha_path[pno].mrph_p, m_num, pos, new_mrph->length, ! Cha_path[pno].state, next_state, cost, new_mrph->is_undef); #endif ! /* ! * cost を計算 ! */ cost = Cha_path[pno].cost + con_cost * Cha_con_cost_weight; ! /* ! * どの pcost に属するか調べる ! */ for (pcostno = 0; pcostno < pcost_num; pcostno++) ! if (next_state == pcost[pcostno].state) ! break; if (pcostno < pcost_num) { ! /* ! * tricky: when Cha_cost_width is -1, ">-1" means ">=0" ! */ if (cost - pcost[pcostno].min_cost > Cha_cost_width) ! continue; } else { ! /* ! * 新しい pcost を作る ! */ pcost_num++; pcost[pcostno].num = 0; pcost[pcostno].state = next_state; pcost[pcostno].min_cost = INT_MAX; } ! /* ! * pcost に登録 ! */ if (Cha_cost_width < 0) { pcost[pcostno].min_cost = cost; pcost[pcostno].pno[0] = pno; *************** *** 505,587 **** } if (pcost_num == 0) ! return TRUE; ! /* 形態素コスト */ if (new_mrph->is_undef) { ! mrph_cost = Cha_undef_info[new_mrph->is_undef-1].cost ! + Cha_undef_info[new_mrph->is_undef-1].cost_step * new_mrph->length / 2; } else { mrph_cost = Cha_hinsi[new_mrph->hinsi].cost; } mrph_cost *= new_mrph->weight * Cha_mrph_cost_weight; ! #if 0 ! if (Cha_path[Cha_path_num].end == pos + new_mrph->length && ! Cha_path[Cha_path_num].state == pcost[pcostno].state && ! Cha_path[Cha_path_num].cost > pcost[pcostno].min_cost + mrph_cost) { ! return TRUE; ! } ! #endif ! ! for (pcostno = 0; pcostno < pcost_num; pcostno++) { ! /* コスト幅におさまっているパスを抜き出す */ ! if (Cha_cost_width < 0) { ! Cha_path[Cha_path_num].path = malloc_int(2); ! Cha_path[Cha_path_num].path[0] = pcost[pcostno].pno[0]; ! Cha_path[Cha_path_num].path[1] = -1; ! } else { ! int npath = 0; ! int path[PATH1_NUM]; ! haba_cost = pcost[pcostno].min_cost + Cha_cost_width; ! path[npath++] = pcost[pcostno].pno[pcost[pcostno].min_cost_no]; ! for (i = 0; i < pcost[pcostno].num; i++) ! if (pcost[pcostno].cost[i] <= haba_cost && i != pcost[pcostno].min_cost_no) path[npath++] = pcost[pcostno].pno[i]; ! path[npath++] = -1; ! memcpy(Cha_path[Cha_path_num].path = malloc_int(npath), ! path, sizeof(int) * npath); ! } ! /* Cha_path に登録 */ ! Cha_path[Cha_path_num].cost = pcost[pcostno].min_cost + mrph_cost; ! Cha_path[Cha_path_num].mrph_p = m_num; ! Cha_path[Cha_path_num].state = pcost[pcostno].state; ! Cha_path[Cha_path_num].start = pos; ! Cha_path[Cha_path_num].end = pos + new_mrph->length; ! #if 0 ! if (Cha_path[Cha_path_num].start < best_start || ! Cha_path[Cha_path_num].end > best_end || ! Cha_path[Cha_path_num].end == best_end && ! Cha_path[Cha_path_num].state == best_state && ! Cha_path[Cha_path_num].cost < best_cost) { ! best_start = Cha_path[Cha_path_num].start; ! best_end = Cha_path[Cha_path_num].end; ! best_state = Cha_path[Cha_path_num].state; ! best_cost = Cha_path[Cha_path_num].cost; ! } ! #endif ! #if 0 ! printf("[%4d,%4d,%5d, %5d]\n",Cha_path[Cha_path_num].start,Cha_path[Cha_path_num].end,Cha_path[Cha_path_num].state,Cha_path[Cha_path_num].cost); ! #endif #ifdef DEBUG ! printf("%3d %3d %5d [p:%d,prev:%d,m:%d,c:%d,pc:%d]\n", ! Cha_path[Cha_path_num].start, Cha_path[Cha_path_num].end, ! Cha_path[Cha_path_num].state, ! Cha_path_num,Cha_path[Cha_path_num].path[0],m_num,pcost[0].cost[i],Cha_path[Cha_path_num].cost); #endif ! if (++Cha_path_num % CHA_PATH_NUM == 0 && malloc_path()) ! return FALSE; ! } return TRUE; } ! static void set_mrph_end(mrph2_t *mrph) { mrph->midasi = mrph->yomi = mrph->info = ""; mrph->base = mrph->pron = ""; ! mrph->comp = "\n"; mrph->base_length = mrph->length = 3; mrph->hinsi = 0; --- 413,482 ---- } if (pcost_num == 0) ! return TRUE; ! /* ! * 形態素コスト ! */ if (new_mrph->is_undef) { ! mrph_cost = Cha_undef_info[new_mrph->is_undef - 1].cost ! + Cha_undef_info[new_mrph->is_undef - ! 1].cost_step * new_mrph->length / 2; } else { mrph_cost = Cha_hinsi[new_mrph->hinsi].cost; } mrph_cost *= new_mrph->weight * Cha_mrph_cost_weight; ! for (pcostno = 0; pcostno < pcost_num; pcostno++) { ! /* ! * コスト幅におさまっているパスを抜き出す ! */ ! if (Cha_cost_width < 0) { ! Cha_path[Cha_path_num].path = malloc_int(2); ! Cha_path[Cha_path_num].path[0] = pcost[pcostno].pno[0]; ! Cha_path[Cha_path_num].path[1] = -1; ! } else { ! int npath = 0; ! int path[PATH1_NUM]; ! haba_cost = pcost[pcostno].min_cost + Cha_cost_width; ! path[npath++] = pcost[pcostno].pno[pcost[pcostno].min_cost_no]; ! for (i = 0; i < pcost[pcostno].num; i++) ! if (pcost[pcostno].cost[i] <= haba_cost ! && i != pcost[pcostno].min_cost_no) path[npath++] = pcost[pcostno].pno[i]; ! path[npath++] = -1; ! memcpy(Cha_path[Cha_path_num].path = malloc_int(npath), ! path, sizeof(int) * npath); ! } ! /* ! * Cha_path に登録 ! */ ! Cha_path[Cha_path_num].cost = pcost[pcostno].min_cost + mrph_cost; ! Cha_path[Cha_path_num].mrph_p = m_num; ! Cha_path[Cha_path_num].state = pcost[pcostno].state; ! Cha_path[Cha_path_num].start = pos; ! Cha_path[Cha_path_num].end = pos + new_mrph->length; #ifdef DEBUG ! printf("%3d %3d %5d [p:%d,prev:%d,m:%d,c:%d,pc:%d]\n", ! Cha_path[Cha_path_num].start, Cha_path[Cha_path_num].end, ! Cha_path[Cha_path_num].state, ! Cha_path_num, Cha_path[Cha_path_num].path[0], m_num, ! pcost[0].cost[i], Cha_path[Cha_path_num].cost); #endif ! if (++Cha_path_num % CHA_PATH_NUM == 0 && malloc_path()) ! return FALSE; ! } return TRUE; } ! static void ! set_mrph_end(mrph2_t * mrph) { mrph->midasi = mrph->yomi = mrph->info = ""; mrph->base = mrph->pron = ""; ! mrph->compound = "\n"; mrph->base_length = mrph->length = 3; mrph->hinsi = 0; *************** *** 592,951 **** mrph->weight = MRPH_DEFAULT_WEIGHT; } ! static int set_mrph_bkugiri(void) { static int bkugiri_num; int h; mrph2_t *mrph; if (Cha_mrph[1].midasi) ! return bkugiri_num; for (h = 0; Cha_hinsi[h].name; h++) { if (!Cha_hinsi[h].bkugiri) ! continue; mrph = &Cha_mrph[++bkugiri_num]; ! /* memset: unnecessary? */ memset(mrph, 0, sizeof(mrph2_t)); mrph->hinsi = h; mrph->con_tbl = cha_check_table_for_undef(h); ! mrph->midasi = mrph->yomi = mrph->base = mrph->pron = Cha_hinsi[h].bkugiri; mrph->info = ""; - #if 0 - mrph->base_length = mrph->length = 0; - mrph->ktype = mrph->kform = 0; - mrph->is_undef = 0; - mrph->weight = 0; - #endif } return bkugiri_num; } ! static int strcmp_anno(char *target) { ! int i; ! ! for (i = 1; Cha_anno_info[i].str1; i++) ! if (!memcmp(target, Cha_anno_info[i].str1, Cha_anno_info[i].len1)) ! return -i; ! return 0; ! } ! ! /* ! * euc_check_undefword_len() ! * ! * type: -n(n=idx of Cha_anno_info) / 0(space) / 1(1byte) / 2(2byte) / 3(3byte) ! * stat: -n(n=idx of Cha_anno_info) / 0(space) / 1(e:[a-zA-Z],j:1byte) ! * / 2(katakana) / 3(alphabet) / 4(otherwise) ! */ ! static void euc_check_undefword_len(char *target, short *undefword_len, ! char *char_type, int target_len) ! { ! unsigned char *t; ! short *ulen0, *ulen; ! char *type; ! int stat0 = 1, stat = 0; ! anno_info *anno = NULL; ! ! memset(undefword_len, 0, target_len * sizeof(short)); ! memset(char_type, 1, target_len + 1); ! ! t = (unsigned char *)target; ! ulen0 = ulen = undefword_len; ! type = char_type; ! ! while ((char *)t < target + target_len) { ! if (stat0 < 0 && ! (anno->len2 == 0 || ! ((char *)t - target > anno->len2 && ! !memcmp(t - anno->len2, anno->str2, anno->len2)))) ! stat0 = 99; ! if (stat0 < 0) { ! ; ! } else if (is_spc(*t)) { ! *type = stat = 0; ! } else if ((stat = strcmp_anno(t)) < 0) { ! *type = stat; ! anno = &Cha_anno_info[-stat]; ! } else if (Cha_lang_e ! ? ((*t>='a' && *t<='z') || (*t>='A' && *t<='Z')) ! : !(*t & 0x80)) { ! /*: (!(*t & 0x80) && !(*t>='0'&&*t<='9'||*t=='.'||*t==','))) {*/ ! /* [a-zA-Z] / [^1-9.,] */ ! stat = 1; ! } else if (/* zenkaku chou-on */ ! (stat0 == 2 && t[0] == 0xa1 && t[1] == 0xbc) || ! /* 0xa5a1-0xa5ff: zenkaku katakana */ ! (t[0] == 0xa5 && t[1] >= 0xa1 && ! !(stat0 != 2 && ! /* small aiueo,tsu,yayuyo,wa and chou-on */ ! ((t[0] == 0xa5 && ! (t[1] == 0xa1 || t[1] == 0xa3 || t[1] == 0xa5 || ! t[1] == 0xa7 || t[1] == 0xa9 || t[1] == 0xc3 || ! t[1] == 0xe3 || t[1] == 0xe5 || t[1] == 0xe7 || ! t[1] == 0xee)) || ! (t[0] == 0xa1 && t[1] == 0xbc))))) { ! stat = 2; ! } else if (t[0] == 0xa3 && t[1] >= 0xc1) { ! /* 0xa3c1-0xa3ff: zenkaku alphabet */ ! stat = 3; ! } else { ! stat = 4; ! stat0 = 99; ! } ! if (stat != stat0) { ! *ulen0 = ulen - ulen0; ! ulen0 = ulen; } ! if (stat < 0 && stat != stat0) { ! int len = anno->len1 + anno->len2; ! if (len > target_len - ((char *)t - target)) ! len = target_len - ((char *)t - target); ! t += len; ! ulen += len; ! type += len; ! } else if (t[0] == 0x8f && (t[1] & 0x80) && (t[2] & 0x80)) { ! *type = 3; ! t += 3; ulen += 3, type += 3; ! } else if ((t[0] & 0x80) && (t[1] & 0x80)) { ! *type = 2; ! t += 2; ulen += 2, type += 2; ! } else { ! t++; ulen++, type++; ! } ! stat0 = stat; } ! *ulen0 = ulen - ulen0; } /* ! * utf8_check_undefword_len() ! * ! * type: -n(n=idx of Cha_anno_info) / ! * 0(space) / 1(1byte) / 2(2byte) / 3(3byte) / 4(4byte) ! * stat: -n(n=idx of Cha_anno_info) / 0(space) / 1(e:[a-zA-Z],j:1byte) ! * / 2(katakana) / 3(alphabet) / 4(otherwise) */ ! static void utf8_check_undefword_len(char *target, short *undefword_len, ! char *char_type, int target_len) { ! unsigned char *t; ! short *ulen0, *ulen; ! char *type; ! int stat0 = 1, stat = 0; ! anno_info *anno = NULL; ! ! memset(undefword_len, 0, target_len * sizeof(short)); ! memset(char_type, 1, target_len + 1); ! ! t = (unsigned char *)target; ! ulen0 = ulen = undefword_len; ! type = char_type; ! ! while ((char *)t < target + target_len) { ! if (stat0 < 0 && ! (anno->len2 == 0 || ! ((char *)t - target > anno->len2 && ! !memcmp(t - anno->len2, anno->str2, anno->len2)))) ! stat0 = 99; ! if (stat0 < 0) { ! ; ! } else if (is_spc(*t)) { ! *type = stat = 0; ! } else if ((stat = strcmp_anno(t)) < 0) { ! *type = stat; ! anno = &Cha_anno_info[-stat]; ! } else if (Cha_lang_e ! ? ((*t>='a' && *t<='z') || (*t>='A' && *t<='Z')) ! : !(*t & 0x80) /* (not in GR) == GL */ ! ) { ! stat = 1; ! } else if (/* KATAKANA-HIRAGANA PROLONGED SOUND MARK ! U+30FC */ ! (stat0 == 2 && ! t[0] == 0xe3 && t[1] == 0x83 && t[2] == 0xbc) || ! /* KATAKANA LETTER (SMALL) [A-VO] ! U+30A1 -- U+30FA ! include VA, VI, VE, VO */ ! ((t[0] == 0xe3 && ! ((t[1] == 0x82 && t[2] >= 0xa1 && t[2] <= 0xbf) || ! (t[1] == 0x83 && t[2] >= 0x80 && t[2] <= 0xBA))) && ! !(stat0 != 2 && ! /* KATAKANA LETTER SMALL AIUEO, TU, YAYUYO, WA and ! KATAKANA-HIRAGANA PROLONGED SOUND MARK ! U+30A1, U+30A3, U+30A5, U+30A7, U+30A9 ! U+30C3, U+30E3, U+30E5, U+30E7, U+30EE ! U+30FC */ ! (t[0] == 0xe3 && ! ((t[1] == 0x82 && ! (t[2] == 0xa1 || t[2] == 0xa3 || t[2] == 0xa5 || ! t[2] == 0xa7 || t[2] == 0xa9)) || ! (t[1] == 0x83 && ! (t[2] == 0x83 || t[2] == 0xa3 || t[2] == 0xa5 || ! t[2] == 0xa7 || t[2] == 0xae || ! t[2] == 0xbc))))))) { ! stat = 2; ! } else if (t[0] == 0xef && ! ((t[1] == 0xbc && t[2] >= 0xa1 && t[2] <= 0xba) || ! (t[1] == 0xbd && t[2] >= 0x81 && t[2] <= 0x9a))) { ! /* FULLWIDTH LATIN (CAPITAL|SMALL) LETTER [A-Z] ! U+FF21 -- U+FF3A, U+FF41 -- U+FF5A */ ! stat = 3; ! } else { ! stat = 4; ! stat0 = 99; ! } ! if (stat != stat0) { ! *ulen0 = ulen - ulen0; ! ulen0 = ulen; ! } ! if (stat < 0 && stat != stat0) { ! int len = anno->len1 + anno->len2; ! if (len > target_len - ((char *)t - target)) ! len = target_len - ((char *)t - target); ! t += len; ! ulen += len; ! type += len; ! } else if ((t[0] & 0xf0) == 0xf0 && ! (t[1] & 0x80) && (t[2] & 0x80) && (t[3] & 0x80)) { ! *type = 4; ! t += 4; ulen += 4, type += 4; ! } else if ((t[0] & 0xe0) == 0xe0 && ! (t[1] & 0x80) && (t[2] & 0x80)) { ! *type = 3; ! t += 3; ulen += 3, type += 3; ! } else if ((t[0] & 0xc0) == 0xc0 && (t[1] & 0x80)) { ! *type = 2; ! t += 2; ulen += 2, type += 2; ! } else { ! t++; ulen++, type++; } - stat0 = stat; } ! *ulen0 = ulen - ulen0; } ! /* ! * iso8859_check_undefword_len() ! * ! * type: -n(n=idx of Cha_anno_info) / ! * 0(space) / 1(1byte) ! * stat: -n(n=idx of Cha_anno_info) / 0(space) / 1(otherwise) ! */ ! static void iso8859_check_undefword_len(char *target, short *undefword_len, ! char *char_type, int target_len) { ! unsigned char *t; ! short *ulen0, *ulen; ! char *type; ! int stat0 = 1, stat = 0; ! anno_info *anno = NULL; ! ! memset(undefword_len, 0, target_len * sizeof(short)); ! memset(char_type, 1, target_len + 1); ! ! t = (unsigned char *)target; ! ulen0 = ulen = undefword_len; ! type = char_type; ! ! while ((char *)t < target + target_len) { ! if (stat0 < 0 && ! (anno->len2 == 0 || ! ((char *)t - target > anno->len2 && ! !memcmp(t - anno->len2, anno->str2, anno->len2)))) ! stat0 = 99; ! if (stat0 < 0) { ! ; ! } else if (is_spc(*t)) { ! *type = stat = 0; ! } else if ((stat = strcmp_anno(t)) < 0) { ! *type = stat; ! anno = &Cha_anno_info[-stat]; ! } else { ! stat = 1; ! } ! ! if (stat != stat0) { ! *ulen0 = ulen - ulen0; ! ulen0 = ulen; ! } ! if (stat < 0 && stat != stat0) { ! int len = anno->len1 + anno->len2; ! if (len > target_len - ((char *)t - target)) ! len = target_len - ((char *)t - target); ! t += len; ! ulen += len; ! type += len; ! } else { ! t++; ulen++, type++; ! } ! stat0 = stat; } ! *ulen0 = ulen - ulen0; } ! static int is_jisx0208_alphabet(unsigned char *t) ! { ! if (Cha_encode == CHA_ENCODE_EUC) { ! return (t[0] == 0xa3 && t[1] >= 0xc1); ! } else if (Cha_encode == CHA_ENCODE_UTF8) { ! return (t[0] == 0xef && ! ((t[1] == 0xbc && t[2] >= 0xa1 && t[2] <= 0xba) || ! (t[1] == 0xbd && t[2] >= 0x81 && t[2] <= 0x9a))); ! } ! return 0; ! } ! /*********************************************************************** * cha_parse_sentence() - 一文を形態素解析する * * return value: * 0 - ok * 1 - no result / too many morphs ! ***********************************************************************/ ! int cha_parse_sentence(char *target, int target_len, int opt_nobk) { ! int i, dic_no; ! int pos, pos_end; ! int path_idx[PATH1_NUM], path_idx_num; ! int mrph_idx, new_mrph_idx; ! int undef_len; ! int bkugiri_num = 0, bk; static int path0 = -1; - char *dic_buffer[256]; - static short undefword_len[CHA_INPUT_SIZE]; - static char char_type[CHA_INPUT_SIZE + 3]; - - if (Cha_encode == CHA_ENCODE_EUC) { - euc_check_undefword_len(target, undefword_len, - char_type, target_len); - } else if (Cha_encode == CHA_ENCODE_EUC) { - utf8_check_undefword_len(target, undefword_len, - char_type, target_len); - } else if (Cha_encode == CHA_ENCODE_ISO8859) { - iso8859_check_undefword_len(target, undefword_len, - char_type, target_len); - } else { - iso8859_check_undefword_len(target, undefword_len, - char_type, target_len); - } ! cha_set_sentence(target, undefword_len, char_type); free_chars(); free_path(); free_mrph(); ! /* 文頭処理 */ Cha_path[0].start = Cha_path[0].end = 0; Cha_path[0].path = &path0; Cha_path[0].cost = 0; --- 487,675 ---- mrph->weight = MRPH_DEFAULT_WEIGHT; } ! static int ! set_mrph_bkugiri(void) { static int bkugiri_num; int h; mrph2_t *mrph; if (Cha_mrph[1].midasi) ! return bkugiri_num; for (h = 0; Cha_hinsi[h].name; h++) { if (!Cha_hinsi[h].bkugiri) ! continue; mrph = &Cha_mrph[++bkugiri_num]; ! /* ! * memset: unnecessary? ! */ memset(mrph, 0, sizeof(mrph2_t)); mrph->hinsi = h; mrph->con_tbl = cha_check_table_for_undef(h); ! mrph->midasi = mrph->yomi = mrph->base = mrph->pron = ! Cha_hinsi[h].bkugiri; mrph->info = ""; } return bkugiri_num; } ! static int ! lookup_dic(char *target, int target_len, int cursor, int new_mrph_idx) { ! int dic_no; ! char *dic_buffer[256]; ! /* ! * 辞書引き(全角文字のみ検索する) EUC only ! */ ! if (Cha_encode == CHASEN_ENCODE_EUC && ! cha_tok_mblen_on_cursor(Cha_tokenizer, cursor) == 2) { ! for (dic_no = 0; dic_no < Pat_ndicfile; dic_no++) { ! int nmrph; ! /* ! * パトリシア木から形態素を検索 ! */ ! pat_search(Pat_dicfile[dic_no], target + cursor, dic_buffer); ! /* ! * 活用させつつ形態素を Cha_mrph に追加 ! */ ! nmrph = convert_mrphs(target + cursor, dic_buffer, ! new_mrph_idx); ! if (nmrph < 0) ! return -1; ! new_mrph_idx += nmrph; } + } ! for (dic_no = 0; dic_no < Suf_ndicfile; dic_no++) { ! int nmrph; ! /* ! * SUFARY ファイルから形態素を検索 ! */ ! sa_common_prefix_search(Suf_dicfile[dic_no], ! target + cursor, target_len - cursor, ! dic_buffer); ! /* ! * 活用させつつ形態素を Cha_mrph に追加 ! */ ! nmrph = convert_mrphs(target + cursor, dic_buffer, new_mrph_idx); ! if (nmrph < 0) ! return -1; ! new_mrph_idx += nmrph; } ! return new_mrph_idx; } + /* ! * 未定義語処理 */ ! static int ! set_undefword(char *target, int cursor, int new_mrph_idx, int mrph_idx, ! int *path_idx) { ! int undef_len; ! int i; ! undef_len = cha_tok_char_type_len(Cha_tokenizer, cursor); ! #if 0 ! printf("# cursor: %d, undef_len: %d\n", cursor, undef_len); ! #endif ! /* ! * 直前のパスとの接続をチェック ! */ ! for (i = mrph_idx; i < new_mrph_idx; i++) { ! /* ! * 未定義語と同じ長さの単語が辞書にあれば未定義語を追加しない ! */ ! if (Cha_con_cost_undef > 0 && Cha_mrph[i].length == undef_len) ! undef_len = 0; ! if (check_connect(cursor, i, path_idx) == FALSE) ! return -1; ! } ! /* ! * 未定義語の追加 ! */ ! if (undef_len > 0) { ! int no; ! for (no = 0; no < Cha_undef_info_num; no++, new_mrph_idx++) { ! if (register_undef_mrph1(target + cursor, new_mrph_idx, ! undef_len, no) == FALSE) ! return -1; ! if (check_connect(cursor, new_mrph_idx, path_idx) == FALSE) ! return -1; ! #if 0 ! printf("path[0]: %d:%d\n", Cha_path_num - 1, ! Cha_path[Cha_path_num - 1].path[0]); ! #endif } } ! return new_mrph_idx; } ! static int ! add_bkugiri(int cursor, int *path_idx, int path_idx_num, int bkugiri_num) { ! int bk; ! for (bk = 0; bk < bkugiri_num; bk++) { ! int path_num; ! path_num = Cha_path_num; ! /* ! * 文節区切りを追加 ! */ ! if (check_connect(cursor, bk + 1, path_idx) == FALSE) ! return -1; ! #if 0 ! printf("PATH: %d: %d -> %d\n", cursor, path_num, Cha_path_num); ! #endif ! /* ! * 追加された path を path_idx に追加 ! */ ! if (Cha_path_num > path_num) ! for (; path_num < Cha_path_num; path_num++) ! path_idx[path_idx_num++] = path_num; ! path_idx[path_idx_num] = -1; } ! return path_idx_num; } ! #define cursor_sep(c, l) \ ! ((!cha_tok_is_jisx0208_latin(Cha_tokenizer,(c), (l))) ? \ ! cha_tok_mblen_on_cursor(Cha_tokenizer, (c)) : \ ! cha_tok_char_type_len(Cha_tokenizer, (c))) ! /* * cha_parse_sentence() - 一文を形態素解析する * * return value: * 0 - ok * 1 - no result / too many morphs ! */ ! int ! cha_parse_sentence(char *target, int target_len, int opt_nobk) { ! int cursor, cursor_end; ! int path_idx[PATH1_NUM], path_idx_num; ! int mrph_idx, new_mrph_idx; ! int bkugiri_num = 0; static int path0 = -1; ! cha_tok_parse(Cha_tokenizer, target, target_len + 1); free_chars(); free_path(); free_mrph(); ! /* ! * 文頭処理 ! */ Cha_path[0].start = Cha_path[0].end = 0; Cha_path[0].path = &path0; Cha_path[0].cost = 0; *************** *** 955,1093 **** Cha_path_num = 1; set_mrph_end(&Cha_mrph[0]); if (!opt_nobk) ! bkugiri_num = set_mrph_bkugiri(); new_mrph_idx = mrph_idx = bkugiri_num + 1; ! for (pos = pos_end = 0; pos < target_len; ! pos += ! ((char_type[pos] >= 2 && !is_jisx0208_alphabet(target + pos)) ! ? char_type[pos] : undefword_len[pos]), ! pos_end = pos) { #if 0 printf("# mrph %d\n", mrph_idx); #endif ! while (char_type[pos] <= 0) ! pos += undefword_len[pos]; ! if (pos == target_len) ! break; ! path_idx_num = collect_mrphs_for_pos(pos_end, path_idx); #if 0 printf("# path_idx_num %d\n", path_idx_num); #endif if (path_idx_num == 0) ! continue; ! ! #if 1 ! for (bk = 0; bk < bkugiri_num; bk++) { ! int path_num; ! path_num = Cha_path_num; ! /* 文節区切りを追加 */ ! if (check_connect(pos, bk + 1, path_idx) == FALSE) ! goto error_end; ! #if 0 ! printf("PATH: %d: %d -> %d\n", pos, path_num, Cha_path_num); ! #endif ! /* 追加された path を path_idx に追加 */ ! if (Cha_path_num > path_num) ! for (; path_num < Cha_path_num; path_num++) ! path_idx[path_idx_num++] = path_num; ! path_idx[path_idx_num] = -1; ! } ! #endif ! ! /* 辞書引き(全角文字のみ検索する) EUC only */ ! if (Cha_encode == CHA_ENCODE_EUC && char_type[pos] == 2) { ! for (dic_no = 0; dic_no < Pat_ndicfile; dic_no++) { ! int nmrph; ! /* パトリシア木から形態素を検索 */ ! pat_search(Pat_dicfile[dic_no], target + pos, ! dic_buffer); ! /* 活用させつつ形態素を Cha_mrph に追加 */ ! nmrph = convert_mrphs(target + pos, dic_buffer, ! new_mrph_idx); ! if (nmrph < 0) ! goto error_end; ! new_mrph_idx += nmrph; ! } ! } ! for (dic_no = 0; dic_no < Suf_ndicfile; dic_no++) { ! int nmrph; ! /* SUFARY ファイルから形態素を検索 */ ! sa_common_prefix_search(Suf_dicfile[dic_no], ! target + pos, char_type + pos, ! dic_buffer); ! /* 活用させつつ形態素を Cha_mrph に追加 */ ! nmrph = convert_mrphs(target + pos, dic_buffer, ! new_mrph_idx); ! if (nmrph < 0) ! goto error_end; ! new_mrph_idx += nmrph; ! } ! ! #if 0 ! /* ! * 未定義連接コストが 0 のときか、単語が1つも辞書引きできなかったら ! * 未定義語を Cha_mrph に追加 ! */ ! if (Cha_con_cost_undef == 0 || mrph_idx == new_mrph_idx) { ! if (register_undef_mrph(target + pos, new_mrph_idx) == FALSE) ! goto error_end; ! new_mrph_idx += Cha_undef_info_num; ! } ! #endif ! ! /* 未定義語処理 */ ! undef_len = undefword_len[pos]; ! #if 0 ! undef_len = undef_mrph_len(target + pos); ! #endif ! ! #if 0 ! printf("# pos: %d, undef_len: %d\n", pos, undef_len); ! #endif ! /* 直前のパスとの接続をチェック */ ! for (i = mrph_idx; i < new_mrph_idx; i++) { ! /* 未定義語と同じ長さの単語が辞書にあれば未定義語を追加しない */ ! if (Cha_con_cost_undef > 0 && Cha_mrph[i].length == undef_len) ! undef_len = 0; ! if (check_connect(pos, i, path_idx) == FALSE) ! goto error_end; ! } ! ! /* 未定義語の追加 */ ! if (undef_len > 0) { ! int no; ! for (no = 0; no < Cha_undef_info_num; no++, new_mrph_idx++) { ! if (register_undef_mrph1(target + pos, new_mrph_idx, ! undef_len, no) == FALSE) ! goto error_end; ! if (check_connect(pos, new_mrph_idx, path_idx) == FALSE) ! goto error_end; ! #if 0 ! printf("path[0]: %d:%d\n",Cha_path_num-1,Cha_path[Cha_path_num-1].path[0]); ! #endif ! } ! } mrph_idx = new_mrph_idx; } ! /* 文末処理 */ set_mrph_end(&Cha_mrph[mrph_idx]); if (++mrph_idx % MRPH_NUM == 0 && malloc_mrph()) ! goto error_end; ! collect_mrphs_for_pos(pos_end, path_idx); ! if (check_connect(pos, mrph_idx - 1, path_idx) == FALSE) ! goto error_end; #ifdef DEBUG for (i = 1; i < mrph_idx - 1; i++) { printf("%4d: %4d ", i, Cha_mrph[i].con_tbl); ! print_mrph(0, &Cha_mrph[i], 'F', "%-11m %-11y %-11P3- %-14T %F \n"); } #endif --- 679,741 ---- Cha_path_num = 1; set_mrph_end(&Cha_mrph[0]); if (!opt_nobk) ! bkugiri_num = set_mrph_bkugiri(); new_mrph_idx = mrph_idx = bkugiri_num + 1; ! for (cursor = cursor_end = 0; cursor <= target_len; ! cursor += cursor_sep(cursor, target_len - cursor)) { #if 0 printf("# mrph %d\n", mrph_idx); #endif ! cursor_end = cursor; ! /* skip annotations and white space */ ! while (cha_tok_anno_type(Cha_tokenizer, cursor) != 0 ) { ! cursor += cha_tok_char_type_len(Cha_tokenizer, cursor); ! if (cursor > target_len) ! break; ! } ! path_idx_num = collect_mrphs_for_pos(cursor_end, path_idx); #if 0 printf("# path_idx_num %d\n", path_idx_num); #endif if (path_idx_num == 0) ! continue; ! path_idx_num = add_bkugiri(cursor, ! path_idx, path_idx_num, bkugiri_num); ! if (path_idx_num < 0) ! goto error_end; ! ! new_mrph_idx = lookup_dic(target, target_len, cursor, new_mrph_idx); ! if (new_mrph_idx < 0) ! goto error_end; ! ! new_mrph_idx = set_undefword(target, cursor, ! new_mrph_idx, mrph_idx, path_idx); ! if (new_mrph_idx < 0) ! goto error_end; mrph_idx = new_mrph_idx; } ! /* ! * 文末処理 ! */ set_mrph_end(&Cha_mrph[mrph_idx]); if (++mrph_idx % MRPH_NUM == 0 && malloc_mrph()) ! goto error_end; ! collect_mrphs_for_pos(cursor_end, path_idx); ! if (check_connect(cursor, mrph_idx - 1, path_idx) == FALSE) ! goto error_end; #ifdef DEBUG for (i = 1; i < mrph_idx - 1; i++) { printf("%4d: %4d ", i, Cha_mrph[i].con_tbl); ! print_mrph(0, &Cha_mrph[i], 'F', ! "%-11m %-11y %-11P3- %-14T %F \n"); } #endif diff -crN chasen-2.2.3/lib/pat.c chasen-2.2.4/lib/pat.c *** chasen-2.2.3/lib/pat.c Wed Feb 14 09:20:53 2001 --- chasen-2.2.4/lib/pat.c Fri Mar 2 09:09:16 2001 *************** *** 35,41 **** * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ! * $Id: pat.c,v 1.11 2001/02/14 00:20:53 masayu-a Exp $ */ #include --- 35,41 ---- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ! * $Id: pat.c,v 1.15 2001/03/02 00:09:16 masayu-a Exp $ */ #include *************** *** 47,168 **** /* * strcpy_tonl() */ ! static void strcpy_tonl(char *dst, char *src) { ! while ((*dst++ = *src++) != '\n') ! ; } ! static int strcmp_tonl(char *s1, char *s2) { ! for (; *s1 != '\n' && *s1 == *s2; s1++, s2++) ! ; ! return (int)(*s1 - *s2); } ! /**************************************************** ! * pat_bits --- 文字列中の指定された位置のビットを返す ! * ! * パラメータ ! * string --- 文字列 ! * cbit --- 指定された位置。文字列全体を一つのビット列と考え、 ! * 先頭(左)bitから 0,1,2,3... で指定する。 ! * len --- 文字列の長さ.strlenをいちいちやってたんじゃ大変だから 900918 ! * ! * 返し値 ! * 0 / not 0 ! ****************************************************/ ! static int pat_bits(char *string, int cbit, int len) { ! int moji_idx = cbit / 8; /* 指定された位置が何文字目か */ ! #if 0 ! static int bitval[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 }; ! printf("[%d,%d,%d]",cbit,moji_idx,len); ! if (moji_idx>len){printf("!!!!!!!!!!!!!!!");exit(1);} ! #endif ! /* 指定された位置 >= 文字列の長さのチェック */ ! if (moji_idx >= len) ! return 0; ! /* トップノードのときは1を返す(topからは必ず右) */ ! if (cbit < 0) ! return 1; ! return string[moji_idx] & (1 << (7 - cbit % 8)); } ! static int pat_memcmp(unsigned char *s1, unsigned char *s2, int n) { if (n == 2) ! return (s1[0] != s2[0] || s1[1] != s2[1]); else ! return memcmp(s1,s2,n); } ! /* key の checkbitビット目で左右に振り分け */ #define get_next_node(node, key, checkbit, key_length) \ ((pat_bits((key), (checkbit), (key_length))) ? (node)->right : (node)->left) ! /**************************************************** ! * pat_search --- パトリシア木を検索 ! * ! * パラメータ ! * key --- 検索キー ! * result --- 結果を入れる. ! * ! * 返し値 ! * 検索終了位置(ポインタ) ! * ! ****************************************************/ ! pat_node *pat_search(pat_t *pat, char *key, char **result) { pat_node *top_ptr = pat->root; pat_node *tmp_ptr = NULL; pat_node *ptr = pat->root->right; pat_index_list *list; int checkbit; ! int key_length = strlen(key); /* キーの文字数を数えておく */ ! int match_len = 0; /* 途中でマッチしたPrefixの文字数 */ int result_last = 0; do { checkbit = ptr->checkbit; ! /* 敷居ビットならば */ ! if(checkbit % SIKII_BIT == 0 && checkbit){ /* 途中単語を探す */ tmp_ptr = ptr->left; #ifdef DEBUG printf("\n[%d,%02x%02x]", checkbit, key[0], key[1]); #endif ! /* 先頭の「見出し語」部分だけでマッチングを行なう */ if (!pat_memcmp(key + match_len, ! pat_get_text(pat, ! (tmp_ptr->il).index) ! + match_len, checkbit / 8 - match_len)) { ! /* 見つけた */ ! match_len = checkbit / 8; /* 途中でマッチしたPrefixの文字数 */ ! list = &(tmp_ptr->il); /* 全リスト要素の取り出し */ while (list != NULL) { ! result[result_last++] = pat_get_text(pat,list->index); list = list->next; } ! } else { /* 途中で失敗を発見 */ result[result_last] = NULL; return ptr; } } ! /* key の checkbitビット目で左右に振り分け */ ptr = get_next_node(ptr, key, checkbit, key_length); } while (checkbit < ptr->checkbit); ! if (ptr != tmp_ptr || ptr == top_ptr) { /* 終了ノードをチェックする */ ! char *line = pat_get_text(pat,(ptr->il).index); ! /* bufferの先頭の「見出し語」部分だけでマッチングを行なう */ ! /* いきどまり単語のPrefixチェック */ if (!pat_memcmp(key + match_len, line + match_len, strlen(line) - match_len)) { ! /* 新登場の単語か否かのチェック */ ! if (match_len != key_length) { ! list = &(ptr->il); /* 全リスト要素の取り出し */ ! while(list != NULL){ result[result_last++] = pat_get_text(pat, list->index); list = list->next; } --- 47,203 ---- /* * strcpy_tonl() */ ! static void ! strcpy_tonl(char *dst, char *src) { ! while ((*dst++ = *src++) != '\n'); } ! static int ! strcmp_tonl(char *s1, char *s2) { ! for (; *s1 != '\n' && *s1 == *s2; s1++, s2++); ! return (int) (*s1 - *s2); } ! /* ! * pat_bits --- 文字列中の指定された位置のビットを返す ! * position given for string ! * return the bit of the position ! * ! * parameters: ! * string --- 文字列 ! * string ! * cbit --- 指定された位置。文字列全体を一つのビット列と考え、 ! * 先頭(左)bitから 0,1,2,3... で指定する。 ! * position. ! * string is regarded as a sequence of bits. ! * the first(left) bit is 0. ! * len --- 文字列の長さ.strlenをいちいちやってたんじゃ大変だから ! * the length of string. --- strlen is cumbersome... ! * ! * return: ! * 0 / not 0 ! */ ! static int ! pat_bits(char *string, int cbit, int len) { ! int moji_idx = cbit / 8; /* 指定された位置が何文字目か */ ! /* the position to what number by character */ ! /* static int bitval[8] = */ ! /* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 }; */ ! /* printf("[%d,%d,%d]", cbit, moji_idx, len); */ ! /* if (moji_idx > len) { */ ! /* printf("!!!!!!!!!!!!!!!"); */ ! /* exit(1); */ ! /* } */ ! ! /* ! * 指定された位置 >= 文字列の長さのチェック ! * check (the position >= the length of string) ! */ ! if (moji_idx >= len) ! return 0; ! /* ! * トップノードのときは1を返す(topからは必ず右) ! * when top node, return 1; ! */ ! if (cbit < 0) ! return 1; ! return string[moji_idx] & (1 << (7 - cbit % 8)); } ! static int ! pat_memcmp(unsigned char *s1, unsigned char *s2, int n) { if (n == 2) ! return (s1[0] != s2[0] || s1[1] != s2[1]); else ! return memcmp(s1, s2, n); } ! /* ! * key の checkbitビット目で左右に振り分け ! * right or left by `checkbit'-th key ! */ #define get_next_node(node, key, checkbit, key_length) \ ((pat_bits((key), (checkbit), (key_length))) ? (node)->right : (node)->left) ! /* ! * pat_search --- search patricia tree ! * ! * parameter ! * key --- 検索キー ! * result --- 結果を入れる. ! * ! * return: ! * pointer to a node which the search end ! */ ! pat_node * ! pat_search(pat_t * pat, char *key, char **result) { pat_node *top_ptr = pat->root; pat_node *tmp_ptr = NULL; pat_node *ptr = pat->root->right; pat_index_list *list; int checkbit; ! int key_length = strlen(key); /* the length of key string */ ! int match_len = 0; /* the length of matched prefix string */ ! int result_last = 0; do { checkbit = ptr->checkbit; ! /* when the SIKIIBIT (= the character segment) */ ! if (checkbit % SIKII_BIT == 0 && checkbit) { /* 途中単語を探す */ /* search word */ tmp_ptr = ptr->left; #ifdef DEBUG printf("\n[%d,%02x%02x]", checkbit, key[0], key[1]); #endif ! /* ! * 先頭の「見出し語」部分だけでマッチングを行なう ! * matching by `surface form' ! */ if (!pat_memcmp(key + match_len, ! pat_get_text(pat, (tmp_ptr->il).index) ! + match_len, checkbit / 8 - match_len)) { /* found! */ ! ! match_len = checkbit / 8; /* the character length of matched prefix */ ! list = &(tmp_ptr->il); /* pick up the all elements */ ! while (list != NULL) { ! result[result_last++] = pat_get_text(pat, list->index); list = list->next; } ! } else { /* not found */ result[result_last] = NULL; return ptr; } } ! /* ! * key の checkbitビット目で左右に振り分け ! * right or left by `checkbit'-th bit of key ! */ ptr = get_next_node(ptr, key, checkbit, key_length); } while (checkbit < ptr->checkbit); ! if (ptr != tmp_ptr || ptr == top_ptr) { /* check the end node or not */ ! char *line = pat_get_text(pat, (ptr->il).index); ! /* ! * bufferの先頭の「見出し語」部分だけでマッチングを行なう ! * matching by `surface form' ! */ ! /* ! * いきどまり単語のPrefixチェック ! * check the prefix of dead end word ! */ if (!pat_memcmp(key + match_len, line + match_len, strlen(line) - match_len)) { ! if (match_len != key_length) { /* new word or not */ ! list = &(ptr->il); /* pick up the all elements in list */ ! ! while (list != NULL) { result[result_last++] = pat_get_text(pat, list->index); list = list->next; } *************** *** 174,197 **** return ptr; } ! ! /**************************************************** ! * pat_search_exact --- パトリシア木を検索(exact match) ! * ! * パラメータ ! * key --- 検索キー ! * x_ptr --- 検索開始位置(ポインタ) ! * result --- 結果を入れる. ! * ! * 返し値 ! * 検索終了位置(ポインタ) ! ****************************************************/ ! pat_node *pat_search_exact(pat_t *pat, char *key, char **result) { pat_node *x_ptr = pat->root; pat_node *ptr; pat_index_list *list; ! int key_length = strlen(key); /* キーの文字数を数えておく */ char *line; int result_last = 0; --- 209,233 ---- return ptr; } ! /* ! * pat_search_exact --- パトリシア木を検索(search patricia tree exact match) ! * ! * parameter ! * key --- 検索キー ! * x_ptr --- 検索開始位置(ポインタ) ! * pointer to position where the search begin ! * result --- 結果を入れる. ! * ! * return: ! * pointer to a node where the search end ! */ ! pat_node * ! pat_search_exact(pat_t * pat, char *key, char **result) { pat_node *x_ptr = pat->root; pat_node *ptr; pat_index_list *list; ! int key_length = strlen(key); /* the length of key */ char *line; int result_last = 0; *************** *** 200,213 **** x_ptr = get_next_node(x_ptr, key, x_ptr->checkbit, key_length); } while (ptr->checkbit < x_ptr->checkbit); ! /* ファイルから取って来る */ line = pat_get_text(pat, (x_ptr->il).index); ! /* bufferの先頭の「見出し語」部分だけでマッチングを行なう */ ! if (strcmp(key, line) == 0){ /* いきどまり単語のチェック */ ! list = &(x_ptr->il); /* 全リスト要素の取り出し */ ! while(list != NULL){ ! line = pat_get_text(pat,list->index); result[result_last++] = line; list = list->next; } --- 236,257 ---- x_ptr = get_next_node(x_ptr, key, x_ptr->checkbit, key_length); } while (ptr->checkbit < x_ptr->checkbit); ! /* ! * ファイルから取って来る ! * get text from file ! */ line = pat_get_text(pat, (x_ptr->il).index); ! /* ! * bufferの先頭の「見出し語」部分だけでマッチングを行なう ! * pattern match by `surface form' which is at the first column in `.int' ! */ ! if (strcmp(key, line) == 0) { /* いきどまり単語のチェック */ ! /* check the dead end word */ ! list = &(x_ptr->il); /* 全リスト要素の取り出し */ ! /* pick up all elements */ ! while (list != NULL) { ! line = pat_get_text(pat, list->index); result[result_last++] = line; list = list->next; } *************** *** 217,240 **** return x_ptr; } ! /**************************************************** ! * pat_search4insert --- 挿入用に検索 ! * ! * パラメータ ! * key --- 検索キー ! * node --- 検索開始位置(ポインタ) ! * ! * 返し値 ! * 検索終了位置(ポインタ) ! * ! * メモ ! * 大域変数 prefix_str の指す先にプレフィックス文字列を入れる。 ! ****************************************************/ ! static pat_node *pat_search4insert(char *key, pat_node *node) { pat_node *tmp_node; ! int key_length = strlen(key); /* キーの文字数を数えておく */ ! do { tmp_node = node; node = get_next_node(node, key, node->checkbit, key_length); --- 261,285 ---- return x_ptr; } ! /* ! * pat_search4insert --- 挿入用に検索 ! * search for insersion ! * ! * parameter ! * key --- 検索キー ! * node --- 検索開始位置(ポインタ) ! * pointer to a position where the search begin ! * ! * return ! * 検索終了位置(ポインタ) ! * pointer to a position where the search end ! */ ! static pat_node * ! pat_search4insert(char *key, pat_node * node) { pat_node *tmp_node; ! int key_length = strlen(key); /* the length of key */ ! do { tmp_node = node; node = get_next_node(node, key, node->checkbit, key_length); *************** *** 243,262 **** return node; } ! ! /**************************************************** ! * pat_insert --- パトリシア木にデータを挿入 ! * ! * パラメータ ! * f --- ファイル ! * line --- データ(挿入キーと内容が区切り文字で区切られている構造) ! * index --- データのファイル上のインデックス ! * x_ptr --- 挿入のための検索の開始位置 ! * ! * 返し値 ! * 無し! ! ****************************************************/ ! void pat_insert(pat_t *pat, char *line, long index) { pat_node *x_ptr = pat->root; pat_node *t_ptr, *p_ptr, *new_ptr; --- 288,311 ---- return node; } ! /* ! * pat_insert --- パトリシア木にデータを挿入 ! * insert data for patricia tree ! * ! * Parameter ! * f --- file ! * line --- データ(挿入キーと内容が区切り文字で区切られている構造) ! * key and contents (segmented by delimiter) ! * index --- データのファイル上のインデックス ! * `index' for data file ! * x_ptr --- 挿入のための検索の開始位置 ! * begining point which the search start ! * ! * return ! * none ! */ ! void ! pat_insert(pat_t * pat, char *line, long index) { pat_node *x_ptr = pat->root; pat_node *t_ptr, *p_ptr, *new_ptr; *************** *** 265,443 **** int buffer_length; int key_length; char key[500]; ! char buffer[50000]; /* 汎用バッファ */ x_ptr = pat->root; ! strcpy(key,line); ! key_length = strlen(key); /* キーの文字数を数えておく */ ! /* キーの探索 */ ! t_ptr = (pat_node*)pat_search4insert(key,x_ptr); ! ! if((t_ptr->il).index >= 0) { ! strcpy_tonl(buffer, pat_get_text(pat,(t_ptr->il).index)); ! ! if(strncmp(key,buffer,strlen(key)) == 0){ /* キーが一致 */ ! /* printf("%s: キーが一致するものがある\n",buffer); ! fflush(stdout); */ list = &(t_ptr->il); ! while(list !=NULL){ ! strcpy_tonl(buffer, pat_get_text(pat,list->index)); ! if(strcmp_tonl(buffer,line)==0){ ! /* if(strncmp(buffer,line,strlen(line))==0){*/ ! /* 全く同じのがあるので挿入せずにリターン */ ! #if 0 ! fprintf(stderr,"%s: 全く同じのがあるので無視\n",buffer); ! fflush(stderr); ! #endif return; } mae_wo_sasu_ptr = list; list = list->next; ! } /* この時点で list はリストの末尾を指す */ ! /* 既にあるキーに内容をさらに挿入する */ ! new_l_ptr = pat_malloc_index_list(); /* indexのlist */ new_l_ptr->index = index; new_l_ptr->next = NULL; mae_wo_sasu_ptr->next = new_l_ptr; return; - } else { /* キーが一致しなかった場合 buffer にその一致しなかったキー */ } } else { /* データの無いノードに落ちた場合: 最初にデータをいれたとき */ ! buffer[0] = buffer[1] = '\0'; /* 16bit */ } ! /* 挿入キーと衝突するキーとの間で ! 最初に異なる bit の位置(diff_bit)を求める */ buffer_length = strlen(buffer); ! for (diff_bit=0; !pat_bits(key, diff_bit, key_length) ! == !pat_bits(buffer, diff_bit, buffer_length); diff_bit++) ! ;/* 空文 */ ! ! ! /* キーを置く位置(x_ptr)を求める。 */ do { p_ptr = x_ptr; ! /* key の checkbitビット目で左右に振り分け */ ! x_ptr = get_next_node(x_ptr, key,x_ptr->checkbit,key_length); ! } while ((x_ptr->checkbit < diff_bit) && (p_ptr->checkbit < x_ptr->checkbit)); ! /* 挿入するノードを生成しキー・検査ビット等を設定する。 */ ! new_ptr = pat_malloc_node(); /* ノード本体 */ ! new_ptr->checkbit = diff_bit; /* チェックビット */ ! (new_ptr->il).index = index; ! (new_ptr->il).next = NULL; ! ! /* 子節と親節を設定する。 */ ! /* ビットが1なら右リンクがキーのある位置を指す。0なら左リンク。 */ ! if (pat_bits(key, new_ptr->checkbit, key_length)){ new_ptr->right = new_ptr; new_ptr->left = x_ptr; } else { new_ptr->left = new_ptr; new_ptr->right = x_ptr; } ! /* ビットが1なら、親の右につなぐ。0なら左。 */ ! if (pat_bits(key,p_ptr->checkbit, key_length)) p_ptr->right = new_ptr; else p_ptr->left = new_ptr; return; - } - - /* This function is broken. */ - /**************************************************** - * pat_show_patfile --- パトリシア木データを出力 - * - * パラメータ - * top_ptr --- 検索開始ノードの位置(ポインタ) - * out_to --- 出力先(stdoutやファイル) - * - * 返し値 - * 無し。パトリシア木データを出力。 - ****************************************************/ - void pat_show_patfile(pat_t *pat, FILE *out_to, char *prefix) - { - #if 0 - long idx = -1; - pat_index_list *t_ptr; - char word[100]; - char pftmp[100]; - char prefix_keep[100]; - pat_node top_ptr = pat->root; - - word[0] = '\0'; - - strcpy(prefix_keep,prefix); - - /* 敷居ビットのとき */ - if(top_ptr->checkbit % SIKII_BIT == 0 && top_ptr->checkbit != 0){ - strcpy(word, pat_get_line(Pat_dicfile[0],top_ptr->left->il_ptr->index)); - strcpy(pftmp,(word+strlen(prefix))); - - /* - printf("#@# %i\n",strlen(word)); - printf("### %i\n",strlen(pftmp)); - - top_ptr->left->str = (char*)malloc(strlen(word)+1); - strcpy(top_ptr->left->str,word); - */ - top_ptr->left->str = (char*)malloc(strlen(pftmp)+1); - strcpy(top_ptr->left->str,pftmp); - - strcat(prefix,pftmp); - - } else { - /* 左右の Subtree の処理。葉っぱでなければ再帰。*/ - if(top_ptr->checkbit < top_ptr->left->checkbit){ - pat_show_patfile(top_ptr->left,out_to,prefix);} - else { - if(top_ptr->left->il_ptr != NULL) { - strcpy(word, pat_get_line(Pat_dicfile[0],top_ptr->left->il_ptr->index)); - strcpy(pftmp,(word+strlen(prefix))); - - /* - printf("#@# %i\n",strlen(word)); - printf("### %i\n",strlen(pftmp)); - - top_ptr->left->str = (char*)malloc(strlen(word)+1); - strcpy(top_ptr->left->str,word); - */ - top_ptr->left->str = (char*)malloc(strlen(pftmp)+1); - strcpy(top_ptr->left->str,pftmp); - } - } - - } - - if(top_ptr->checkbit < top_ptr->right->checkbit){ - pat_show_patfile(top_ptr->right,out_to,prefix);} - else { - if(top_ptr->right->il_ptr != NULL) { - strcpy(word, pat_get_line(Pat_dicfile[0],top_ptr->right->il_ptr->index)); - strcpy(pftmp,(word+strlen(prefix))); - - /* - printf("#@# %i\n",strlen(word)); - printf("### %i\n",strlen(pftmp)); - - top_ptr->left->str = (char*)malloc(strlen(word)+1); - strcpy(top_ptr->left->str,word); - */ - top_ptr->right->str = (char*)malloc(strlen(pftmp)+1); - strcpy(top_ptr->right->str,pftmp); - } - } - - strcpy(prefix,prefix_keep); - return; - #endif } --- 314,427 ---- int buffer_length; int key_length; char key[500]; ! char buffer[50000]; /* buffer for general use */ x_ptr = pat->root; ! strcpy(key, line); ! key_length = strlen(key); /* the length of key */ ! ! /* search the key */ ! t_ptr = (pat_node *) pat_search4insert(key, x_ptr); ! if ((t_ptr->il).index >= 0) { ! strcpy_tonl(buffer, pat_get_text(pat, (t_ptr->il).index)); + if (strncmp(key, buffer, strlen(key)) == 0) { /* match the key */ list = &(t_ptr->il); ! while (list != NULL) { ! strcpy_tonl(buffer, pat_get_text(pat, list->index)); ! if (strcmp_tonl(buffer, line) == 0) { ! /* ! * 全く同じのがあるので挿入せずにリターン ! * return, because there is entirely same string ! */ return; } mae_wo_sasu_ptr = list; list = list->next; ! } /* この時点で list はリストの末尾を指す */ ! /* `list' point the end of list */ ! /* ! * 既にあるキーに内容をさらに挿入する ! * insert the `content' for the existing `key' ! */ ! new_l_ptr = pat_malloc_index_list(); /* list of index */ new_l_ptr->index = index; new_l_ptr->next = NULL; mae_wo_sasu_ptr->next = new_l_ptr; return; } } else { /* データの無いノードに落ちた場合: 最初にデータをいれたとき */ ! /* when the node has no data or ! when the node is inserted initial data */ ! buffer[0] = buffer[1] = '\0'; /* 16bit */ } ! /* ! * 挿入キーと衝突するキーとの間で 最初に異なる bit ! * の位置(diff_bit)を求める ! * take `diff_bit' which is different ! * between insersion key and collision key ! */ buffer_length = strlen(buffer); ! for (diff_bit = 0; !pat_bits(key, diff_bit, key_length) ! == !pat_bits(buffer, diff_bit, buffer_length); ! diff_bit++) ! ; /* empty sentence */ ! ! /* ! * キーを置く位置(x_ptr)を求める。 ! * take `x_ptr' which is put the `key' ! */ do { p_ptr = x_ptr; ! /* ! * key の checkbitビット目で左右に振り分け ! * right or left by `checkbit'-th bit of key ! */ ! x_ptr = get_next_node(x_ptr, key, x_ptr->checkbit, key_length); ! } while ((x_ptr->checkbit < diff_bit) && (p_ptr->checkbit < x_ptr->checkbit)); ! /* ! * 挿入するノードを生成しキー・検査ビット等を設定する。 ! * make the new node to insert, ! * define `checkbit' etc.. ! */ ! new_ptr = pat_malloc_node(); /* make new node */ ! new_ptr->checkbit = diff_bit; /* define checkbit */ ! (new_ptr->il).index = index; /* define index in list */ ! (new_ptr->il).next = NULL; /* define next index in list */ ! ! /* ! * define `mother node' and `daughter node' ! */ ! /* ! * ビットが1なら右リンクがキーのある位置を指す。0なら左リンク。 ! * when bit is `1', right link point the position of key. ! * when bit is `0', left link point the position of key. ! */ ! if (pat_bits(key, new_ptr->checkbit, key_length)) { new_ptr->right = new_ptr; new_ptr->left = x_ptr; } else { new_ptr->left = new_ptr; new_ptr->right = x_ptr; } ! /* ! * ビットが1なら、親の右につなぐ。0なら左。 ! * when bit is `1', connect to right of `mother node'. ! * when bit is `0', connect to left of `mother node'. ! */ ! if (pat_bits(key, p_ptr->checkbit, key_length)) p_ptr->right = new_ptr; else p_ptr->left = new_ptr; return; } diff -crN chasen-2.2.3/lib/pat.h chasen-2.2.4/lib/pat.h *** chasen-2.2.3/lib/pat.h Wed Dec 6 23:57:41 2000 --- chasen-2.2.4/lib/pat.h Fri Mar 2 09:09:17 2001 *************** *** 1,84 **** #ifndef __PAT_H__ #define __PAT_H__ #include "config.h" #include - #define SIKII_BIT 16 /* 文字と文字の区切りは何ビット目? (8 or 16) */ ! /* インデックス用のリスト型の定義 */ typedef struct __pat_index_list { ! struct __pat_index_list *next; /* つぎ */ ! long index; /* ファイルのインデックス */ } pat_index_list; ! /* ノードのデータ構造の定義 */ typedef struct pat_node { ! pat_index_list il; /* インデックスのリスト */ ! short checkbit; /* チェックするビットの指定。(何番目のビット?) */ ! struct pat_node *right; /* 右ノード */ ! struct pat_node *left; /* 左ノード */ } pat_node; typedef struct __pat_h { ! pat_node *root; /* root node */ ! void *_map; /* pointer of the mapped text */ ! off_t _size; /* size of the mapped text */ } pat_t; ! ! /************************** ! * 関数のプロトタイプ宣言 * ! **************************/ pat_t *pat_open(char*, char*); void pat_load(pat_t*, char*); void pat_save(pat_t*, char*); void pat_text_reopen(pat_t*, char*); #define pat_text_size(pat) ((pat)->_size) #define pat_get_text(pat, pos) ((char *)((pat)->_map + (pos))) ! /* pat.c */ ! /* パトリシア木で検索 */ pat_node *pat_search(pat_t*, char*, char**); pat_node *pat_search_exact(pat_t*, char*, char**); ! /* パトリシア木に挿入 */ void pat_insert(pat_t *, char*, long); - /* パトリシア木データを出力 */ - void pat_show_patfile(pat_t*, FILE*, char*); ! /* patfile.c */ ! pat_node *pat_malloc_node(void); /* Matomete malloc */ ! pat_index_list *pat_malloc_index_list(void); /* Matomete malloc */ ! ! /************************************************************************ ! * ! * pat --- パトリシア木の探索と挿入 ! * ! * 作者: たつを(tatuo-y@is.aist-nara.ac.jp) ! * ! * 目的: パトリシア木の探索と挿入を行う ! * ! * 参考文献: ! * アルゴリズムの理解のために文献[1]を参照した。C言語での実装は ! * 文献[2]のプログラムを参考にした。 ! * [1] R. Sedgewick 著 野下浩平、星守、佐藤創、田口東 共訳 ! * アルゴリズム (Algorithms) 原書第2版 第2巻 探索・文字列・計算幾何 ! * 近代科学社,1992. (B195-2,pp.68-72) ! * [2] 島内剛一、有澤誠、野下浩平、浜田穂積、伏見正則 編集委員 ! * アルゴリズム辞典 ! * 共立出版株式会社,1994. (D74,pp.624-625) ! * ! * 履歴: ! * 1996/04/09 動く! (ただし扱えるデータの最大長は8bit。[2]を模倣。) ! * 10 出力ルーチンを再帰に改良。文字列データ対応(最大長無制限)。 ! * 30 セーブ/ロード機能。ノードのデータ構造にID番号を追加(仮)。 ! * 5/06 部分木の全データ出力処理。 ! * 6/11 ChaSenの辞書引き用に改造. ! * 21 連想配列を導入(INDEXをキャッシュする) ! * 7/01 複数の辞書ファイル(パト木)から検索できるようにした. ! * ! * メモ: ChaSenの辞書引きに利用する ! * ! ************************************************************************/ #endif /* __PAT_H__ */ --- 1,155 ---- + /* + * $Id: pat.h,v 1.15 2001/03/02 00:09:17 masayu-a Exp $ + */ + #ifndef __PAT_H__ #define __PAT_H__ #include "config.h" + + #ifdef HAVE_UNISTD_H #include + #endif + + #define SIKII_BIT 16 /* which bit is word segmentation? (8 or 16) */ ! /* list for indexes */ typedef struct __pat_index_list { ! struct __pat_index_list *next; /* next */ ! long index; /* index to file */ } pat_index_list; ! /* node of patricia tree */ typedef struct pat_node { ! pat_index_list il; /* list of index */ ! short checkbit; /* which bit should be checked? */ ! struct pat_node *right; /* right node */ ! struct pat_node *left; /* left node */ } pat_node; + /* patricia tree */ typedef struct __pat_h { ! pat_node *root; /* pointer to the root node */ ! void *_map; /* pointer of the mapped text */ ! off_t _size; /* size of the mapped text */ } pat_t; ! /* ! * functions in patfile.c ! */ ! ! /* ! pat_open -- open the patricia tree ! parameter: pat_open(char *textfile, char *patfile) ! return: pat_t* ! */ pat_t *pat_open(char*, char*); + + /* + pat_load -- load the patricia tree + parameter: pat_load(pat_t * pat, char *patfile) + return: none + */ void pat_load(pat_t*, char*); + + /* + pat_save -- save the patricia tree + parameter: pat_save(pat_t * pat, char *patfile) + return: none + */ void pat_save(pat_t*, char*); + + /* + pat_text_reopen -- reopen textfile + parameter: pat_text_reopen(pat_t * pat, char *textfile) + return: none + */ void pat_text_reopen(pat_t*, char*); + /* + pat_text_size -- return the size of text file + parameter: pat_text_size(pat_t * pat) + return: the size of text file + */ #define pat_text_size(pat) ((pat)->_size) + + /* + pat_get_text -- get text + parameter: pat_get_text(pat_t * pat, position) + return: (char *) string + */ #define pat_get_text(pat, pos) ((char *)((pat)->_map + (pos))) ! /* ! * functions in pat.c ! */ ! /* ! pat_search -- search the key in patricia tree exactly ! parameter: pat_search(pat_t * pat, char *key, char **result) ! return: pointer to a node which the search ended ! */ pat_node *pat_search(pat_t*, char*, char**); + + /* + pat_search_exact -- search the key in patricia tree + parameter: pat_search_exact(pat_t * pat, char *key, char **result) + return: pointer to a node which the search ended + */ pat_node *pat_search_exact(pat_t*, char*, char**); ! ! /* ! pat_insert -- insert data for patricia tree ! parameter: pat_insert(pat_t * pat, char *line, long index) ! return: none ! */ void pat_insert(pat_t *, char*, long); ! /* ! * functions in patfile.c ! */ ! /* ! pat_malloc_node -- malloc for pat_node ! parameter: none ! return: pat_node ! */ ! pat_node *pat_malloc_node(void); ! ! /* ! pat_malloc_index_list -- malloc for pat_index_list ! parameter: none ! return: pat_index_list ! */ ! pat_index_list *pat_malloc_index_list(void); ! ! /* ! * ! * pat --- パトリシア木の探索と挿入 ! * ! * 作者: たつを(tatuo-y@is.aist-nara.ac.jp) ! * ! * 目的: パトリシア木の探索と挿入を行う ! * ! * 参考文献: ! * アルゴリズムの理解のために文献[1]を参照した。C言語での実装は ! * 文献[2]のプログラムを参考にした。 ! * [1] R. Sedgewick 著 野下浩平、星守、佐藤創、田口東 共訳 ! * アルゴリズム (Algorithms) 原書第2版 第2巻 探索・文字列・計算幾何 ! * 近代科学社,1992. (B195-2,pp.68-72) ! * [2] 島内剛一、有澤誠、野下浩平、浜田穂積、伏見正則 編集委員 ! * アルゴリズム辞典 ! * 共立出版株式会社,1994. (D74,pp.624-625) ! * ! * 履歴: ! * 1996/04/09 動く! (ただし扱えるデータの最大長は8bit。[2]を模倣。) ! * 10 出力ルーチンを再帰に改良。文字列データ対応(最大長無制限)。 ! * 30 セーブ/ロード機能。ノードのデータ構造にID番号を追加(仮)。 ! * 5/06 部分木の全データ出力処理。 ! * 6/11 ChaSenの辞書引き用に改造. ! * 21 連想配列を導入(INDEXをキャッシュする) ! * 7/01 複数の辞書ファイル(パト木)から検索できるようにした. ! * ! * メモ: ChaSenの辞書引きに利用する ! * ! */ #endif /* __PAT_H__ */ diff -crN chasen-2.2.3/lib/patfile.c chasen-2.2.4/lib/patfile.c *** chasen-2.2.3/lib/patfile.c Wed Feb 14 09:20:53 2001 --- chasen-2.2.4/lib/patfile.c Fri Mar 2 09:09:17 2001 *************** *** 35,62 **** * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ! * $Id: patfile.c,v 1.10 2001/02/14 00:20:53 masayu-a Exp $ */ #include "config.h" #include - #ifndef _WIN32 - #include - #endif - #include - #include #include "chadic.h" #include "pat.h" pat_node *pat_malloc_node(void); ! static void pat_init_tree_top(pat_node*); ! static void pat_com_l(char*, pat_node*); ! static void pat_com_s(char*, pat_node*); /* * pat_open */ ! pat_t *pat_open(char *textfile, char *patfile) { pat_t *pat; void *map; --- 35,58 ---- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ! * $Id: patfile.c,v 1.15 2001/03/02 00:09:17 masayu-a Exp $ */ #include "config.h" #include #include "chadic.h" #include "pat.h" pat_node *pat_malloc_node(void); ! static void pat_init_tree_top(pat_node *); ! static void pat_com_l(char *, pat_node *); ! static void pat_com_s(char *, pat_node *); /* * pat_open */ ! pat_t * ! pat_open(char *textfile, char *patfile) { pat_t *pat; void *map; *************** *** 73,102 **** return pat; } ! void pat_load(pat_t *pat, char *patfile) { pat_com_l(patfile, pat->root); } ! void pat_save(pat_t *pat, char *patfile) { pat_com_s(patfile, pat->root); } ! void pat_text_reopen(pat_t *pat, char *textfile) { void *map; cha_munmap_file(pat->_map, pat->_size); pat = cha_malloc(sizeof(pat_t)); pat->_size = cha_mmap_file(textfile, &map); ! pat->_map = map; } /* * subroutines for pat_load_anode() */ ! pat_index_list *pat_malloc_index_list(void) { static int idx = 1024; static pat_index_list *ptr; --- 69,102 ---- return pat; } ! void ! pat_load(pat_t * pat, char *patfile) { pat_com_l(patfile, pat->root); } ! void ! pat_save(pat_t * pat, char *patfile) { pat_com_s(patfile, pat->root); } ! void ! pat_text_reopen(pat_t * pat, char *textfile) { void *map; cha_munmap_file(pat->_map, pat->_size); pat = cha_malloc(sizeof(pat_t)); pat->_size = cha_mmap_file(textfile, &map); ! pat->_map = map; } /* * subroutines for pat_load_anode() */ ! pat_index_list * ! pat_malloc_index_list(void) { static int idx = 1024; static pat_index_list *ptr; *************** *** 109,115 **** return ptr + idx++; } ! pat_node *pat_malloc_node(void) { static int idx = 1024; static pat_node *ptr; --- 109,116 ---- return ptr + idx++; } ! pat_node * ! pat_malloc_node(void) { static int idx = 1024; static pat_node *ptr; *************** *** 122,167 **** return ptr + idx++; } ! static void dummy(FILE *fp) { ! fputc(0xff,fp); ! fputc(0xff,fp); ! fputc(0xff,fp); ! fputc(0xff,fp); ! } ! ! /**************************************************** ! * pat_load_anode --- パトリシア木をロード ! * by 米沢恵司(keiji-y@is.aist-nara.ac.jp) ! * ! * パラメータ ! * in --- 入力先ファイル ! * p_ptr --- このノードが外部接点であった時にインデックスを格納する場所 ! * 内部接点であったときは、このポインタは右の子に渡される。 ! * ! * アルゴリズム ! * チェックビットを読み込んだら、それは内部接点だから新しくノードを作る ! * 左部分木、右部分木の順に再帰する ! * 左再帰の時は新しく作ったこの接点のポインタを、 ! * 右再帰の時は p_ptr をインデックスの格納場所として渡す。 ! * インデックスを読み込んだら、それは外部接点だから、p_ptr->index に格納 ! * ! * メモ ! * インデックスの格納場所が元と違うが、特に問題ない。 ! *************************************************************************/ ! static pat_node *pat_load_anode(pat_node *p_ptr, FILE *fp) { unsigned char c; ! pat_node *new_ptr; /* 新しく作ったノード(==このノード)を指すポインタ */ long tmp_idx; ! pat_index_list *new_l_ptr,*t_ptr=NULL; ! ! #if 0 ! static int dbg; ! if (!(++dbg%1000))printf("%d,",dbg); ! #endif ! if ((c = fgetc(fp)) & 0x80) { /* 葉っぱの処理、インデックスの読み込み */ while (c & 0x80) { tmp_idx = (c & 0x3f) << 24; tmp_idx |= fgetc(fp) << 16; --- 123,181 ---- return ptr + idx++; } ! static void ! dummy(FILE * fp) { ! fputc(0xff, fp); ! fputc(0xff, fp); ! fputc(0xff, fp); ! fputc(0xff, fp); ! } ! ! /* ! * pat_load_anode --- パトリシア木をロード ! * load patricia tree ! * by 米沢恵司(keiji-y@is.aist-nara.ac.jp) ! * ! * parameters: ! * p_ptr --- このノードが外部接点であった時にインデックスを格納する場所 ! * 内部接点であったときは、このポインタは右の子に渡される。 ! * when the node is the outside node, p_ptr is the index. ! * when the node is the inside node, ! * this pointer pass to right tree. ! * fp --- input file ! * ! * ! * Algorithm ! * チェックビットを読み込んだら、それは内部接点だから新しくノードを作る ! * 左部分木、右部分木の順に再帰する ! * 左再帰の時は新しく作ったこの接点のポインタを、 ! * 右再帰の時は p_ptr をインデックスの格納場所として渡す。 ! * インデックスを読み込んだら、それは外部接点だから、p_ptr->index に格納 ! * ! * When read `checkbit', it will be inside node. ! * So make a new node. ! * And do recursion to left subtree and right subtree in this order. ! * When left subtree recursion, return pointer to this new node. ! * When right subtree recursion, return p_ptr which contains `index'. ! * When read `index', it will be outside node. ! * Put `index' to p_ptr->index. ! * ! * memo ! * インデックスの格納場所が元と違うが、特に問題ない。 ! * Where contains `index' is different between original algorithm ! * and this program. But no problem. ! */ ! static pat_node * ! pat_load_anode(pat_node * p_ptr, FILE * fp) { unsigned char c; ! pat_node *new_ptr; /* pointer to new node (= this node) */ ! long tmp_idx; ! pat_index_list *new_l_ptr, *t_ptr = NULL; ! if ((c = fgetc(fp)) & 0x80) { /* process leaves, read index */ while (c & 0x80) { tmp_idx = (c & 0x3f) << 24; tmp_idx |= fgetc(fp) << 16; *************** *** 178,269 **** new_l_ptr->next = NULL; t_ptr = new_l_ptr; ! if (c & 0x40) break; c = fgetc(fp); } ! return (p_ptr); ! } else { /* 内部接点の処理、再帰する */ new_ptr = pat_malloc_node(); ! new_ptr->checkbit = ((c << 8) | fgetc(fp)) - 1; /* チェックビット */ ! #if 0 ! printf("#cb %d\n",new_ptr->checkbit); ! #endif (new_ptr->il).index = -1; new_ptr->left = pat_load_anode(new_ptr, fp); new_ptr->right = pat_load_anode(p_ptr, fp); ! return (new_ptr); } } ! /****************************************************** ! * pat_com_l --- 木のロード ! * by 米沢恵司(keiji-y@is.aist-nara.ac.jp) ! * ! * パラメータ、返し値 ! * なし ! ******************************************************/ ! static void pat_com_l(char *fname_pat, pat_node *ptr) { FILE *fp; - #if 0 - fprintf(stderr, "# Loading pat-tree \"%s\" ... ",fname_pat); - #endif - if ((fp = fopen(fname_pat, "rb")) == NULL) { ! fprintf(stderr, "can't open %s\n",fname_pat); exit(1); } ptr->right = pat_load_anode(ptr, fp); fclose(fp); ! #if 0 ! fprintf(stderr,"done.\n"); ! #endif ! } ! ! /**************************************************** ! * save_pat --- パトリシア木データをセーブ ! * by 米沢恵司(keiji-y@is.aist-nara.ac.jp) ! * ! * パラメータ ! * top_ptr --- 検索開始ノードの位置(ポインタ) ! * out_to --- 出力先(stdoutやファイル) ! * ! * 返し値 ! * 無し。パトリシア木データを出力。 ! * ! * 出力フォーマット --- 8ビットに区切ってバイナリ出力 ! * 左優先探索で内部接点はチェックビット、外部接点はインデックスを出力 ! * チェックビット --- 基本的にそのまま (第 0 ビットが 0) ! * ただし -1 のとき困るので 1 を足す ! * インデックス --- 第 0 ビットを 1 にする ! ****************************************************/ ! static void save_pat(pat_node *top_ptr, FILE *fp) { pat_index_list *ptr; ! /* 内部接点の処理、チェックビットを出力 */ ! fputc (((top_ptr->checkbit + 1)>> 8) & 0x7f, fp); ! fputc ((top_ptr->checkbit + 1)& 0xff, fp); ! ! /* 左右の Subtree の処理。葉っぱならインデックスを出力、 ! 葉っぱでなければ再帰。*/ if (top_ptr->checkbit < top_ptr->left->checkbit) save_pat(top_ptr->left, fp); else { ptr = &(top_ptr->left->il); ! if (ptr->index < 0) dummy(fp); else { while (ptr != NULL) { if (ptr->next == NULL) ! fputc (((ptr->index >> 24) & 0x3f) | 0xc0, fp); else ! fputc (((ptr->index >> 24) & 0x3f) | 0x80, fp); ! fputc ((ptr->index >> 16) & 0xff, fp); ! fputc ((ptr->index >> 8) & 0xff, fp); ! fputc ((ptr->index) & 0xff, fp); ptr = ptr->next; } } --- 192,294 ---- new_l_ptr->next = NULL; t_ptr = new_l_ptr; ! if (c & 0x40) ! break; c = fgetc(fp); } ! return p_ptr; ! } else { /* process of inside node (recursive) */ new_ptr = pat_malloc_node(); ! new_ptr->checkbit = ((c << 8) | fgetc(fp)) - 1; /* checkbit */ (new_ptr->il).index = -1; new_ptr->left = pat_load_anode(new_ptr, fp); new_ptr->right = pat_load_anode(p_ptr, fp); ! return new_ptr; } } ! /* ! * pat_com_l --- load tree ! * ! * by 米沢恵司(keiji-y@is.aist-nara.ac.jp) ! */ ! static void ! pat_com_l(char *fname_pat, pat_node * ptr) { FILE *fp; if ((fp = fopen(fname_pat, "rb")) == NULL) { ! fprintf(stderr, "can't open %s\n", fname_pat); exit(1); } ptr->right = pat_load_anode(ptr, fp); fclose(fp); ! } ! ! /* ! * save_pat --- パトリシア木データをセーブ ! * save patricia tree data ! * by 米沢恵司(keiji-y@is.aist-nara.ac.jp) ! * ! * parameters: ! * top_ptr --- 検索開始ノードの位置(ポインタ) ! * pointer to a node -- search starting point ! * fp --- 出力先(stdoutやファイル) ! * output (stdout or FILE) ! * ! * return: ! * none. ! * output patricia tree to `fp' ! * ! * 出力フォーマット --- 8ビットに区切ってバイナリ出力 ! * output format --- segment to 8bit per unit ! * 左優先探索で内部接点はチェックビット、外部接点はインデックスを出力 ! * チェックビット --- 基本的にそのまま (第 0 ビットが 0) ! * ただし -1 のとき困るので 1 を足す ! * インデックス --- 第 0 ビットを 1 にする ! * left most search: ! * if inside node, output `checkbit' ! * if outside node, output `index' ! * `checkbit': basically we use original bit ! * 0 bit will be `0' ! * But when -1 bit, plus 1. ! * `index': 0 bit will be `1' ! */ ! static void ! save_pat(pat_node * top_ptr, FILE * fp) { pat_index_list *ptr; ! /* ! * 内部接点の処理、チェックビットを出力 ! * process inside node, output checkbit ! */ ! fputc(((top_ptr->checkbit + 1) >> 8) & 0x7f, fp); ! fputc((top_ptr->checkbit + 1) & 0xff, fp); ! ! /* ! * 左右の Subtree の処理。葉っぱならインデックスを出力、 ! * 葉っぱでなければ再帰。 ! * process subtree, ! * if node is a leaf, output index ! * otherwise, do recursion ! */ if (top_ptr->checkbit < top_ptr->left->checkbit) save_pat(top_ptr->left, fp); else { ptr = &(top_ptr->left->il); ! if (ptr->index < 0) dummy(fp); else { while (ptr != NULL) { if (ptr->next == NULL) ! fputc(((ptr->index >> 24) & 0x3f) | 0xc0, fp); else ! fputc(((ptr->index >> 24) & 0x3f) | 0x80, fp); ! fputc((ptr->index >> 16) & 0xff, fp); ! fputc((ptr->index >> 8) & 0xff, fp); ! fputc((ptr->index) & 0xff, fp); ptr = ptr->next; } } *************** *** 272,324 **** save_pat(top_ptr->right, fp); else { ptr = &(top_ptr->right->il); ! if (ptr->index < 0) dummy(fp); else { while (ptr != NULL) { if (ptr->next == NULL) ! fputc (((ptr->index >> 24) & 0x3f) | 0xc0, fp); else ! fputc (((ptr->index >> 24) & 0x3f) | 0x80, fp); ! fputc ((ptr->index >> 16) & 0xff, fp); ! fputc ((ptr->index >> 8) & 0xff, fp); ! fputc ((ptr->index) & 0xff, fp); ptr = ptr->next; } } } } ! /***************************************************** ! * pat_com_s --- 木のセーブ ! * by 米沢恵司(keiji-y@is.aist-nara.ac.jp) ! * ! * パラメータ、返し値 ! * なし ! *****************************************************/ ! static void pat_com_s(char *fname_pat, pat_node *ptr) { FILE *fp; ! printf("Saving pat-tree \"%s\" ...\n",fname_pat); fp = fopen(fname_pat, "w+b"); if (fp == NULL) { fprintf(stderr, "can't open %s\n", fname_pat); exit(1); ! }; ! save_pat(ptr->right, fp); /* ファイル出力 */ fclose(fp); } ! /****************************************************** ! * pat_init_tree_top --- パトリシア木の根の初期化 ! * ! * パラメータ ! * ptr --- 初期化する木の根へのポインタ ! ******************************************************/ ! static void pat_init_tree_top(pat_node *ptr) ! { ! (ptr->il).index = -1; /* インデックスのリスト */ ! ptr->checkbit = -1; ! ptr->right = ptr; ! ptr->left = ptr; } --- 297,350 ---- save_pat(top_ptr->right, fp); else { ptr = &(top_ptr->right->il); ! if (ptr->index < 0) ! dummy(fp); else { while (ptr != NULL) { if (ptr->next == NULL) ! fputc(((ptr->index >> 24) & 0x3f) | 0xc0, fp); else ! fputc(((ptr->index >> 24) & 0x3f) | 0x80, fp); ! fputc((ptr->index >> 16) & 0xff, fp); ! fputc((ptr->index >> 8) & 0xff, fp); ! fputc((ptr->index) & 0xff, fp); ptr = ptr->next; } } } } ! /* ! * pat_com_s --- save a patricia tree to file ! * ! * by 米沢恵司(keiji-y@is.aist-nara.ac.jp) ! */ ! static void ! pat_com_s(char *fname_pat, pat_node * ptr) { FILE *fp; ! printf("Saving pat-tree \"%s\" ...\n", fname_pat); fp = fopen(fname_pat, "w+b"); if (fp == NULL) { fprintf(stderr, "can't open %s\n", fname_pat); exit(1); ! }; ! save_pat(ptr->right, fp); /* output to file */ fclose(fp); } ! /* ! * pat_init_tree_top --- initialize a root of patricia tree ! * ! * parameter: ! * ptr --- pointer to a root of patricia tree ! */ ! static void ! pat_init_tree_top(pat_node * ptr) ! { ! (ptr->il).index = -1; /* list of index is -1 */ ! ptr->checkbit = -1; /* checkbit is -1 */ ! ptr->right = ptr; /* right node point itself */ ! ptr->left = ptr; /* left node point itself */ } diff -crN chasen-2.2.3/lib/print.c chasen-2.2.4/lib/print.c *** chasen-2.2.3/lib/print.c Fri Feb 23 21:51:34 2001 --- chasen-2.2.4/lib/print.c Sat Mar 17 08:44:18 2001 *************** *** 37,48 **** * * modified by A.Kitauchi , Sep. 1996 * by O.Imaichi , Sep. 1996 ! * $Id: print.c,v 1.9 2001/02/23 12:51:34 kazuma-t Exp $ */ #include #include #include "chalib.h" #include "pat.h" #define CHA_OUTPUT_SIZE (1024*16) --- 37,50 ---- * * modified by A.Kitauchi , Sep. 1996 * by O.Imaichi , Sep. 1996 ! * $Id: print.c,v 1.17.2.1 2001/03/16 23:44:18 masayu-a Exp $ */ #include #include + #include #include "chalib.h" + #include "tokenizer.h" #include "pat.h" #define CHA_OUTPUT_SIZE (1024*16) *************** *** 50,95 **** static int path_buffer[CHA_INPUT_SIZE]; static int is_bol = 1; static int pos_end = 0; - static char *sentence = ""; - static short *undefword_len; - static char *char_type; ! static void (*cha_putc)(), (*cha_puts)(), (*cha_printf)(); ! static void (*cha_fputc)(), (*cha_fputs)(), (*cha_fprintf)(); ! void cha_set_sentence(char *str, short *lenp, char *typep) { pos_end = 0; - sentence = str; - undefword_len = lenp; - char_type = typep; } /* * cha_clputc, cha_clputs, cha_clprintf ! * - output functions for ChaSen client */ ! static void cha_clputc(int c, FILE *output) { if (is_bol && c == '.') ! putc('.', output); putc(c, output); is_bol = c == '\n' ? 1 : 0; } ! static void cha_clputs(char *s, FILE *output) { if (is_bol && s[0] == '.') ! putc('.', output); fputs(s, output); ! is_bol = s[strlen(s) - 1] == '\n' ? 1 : 0; } ! static void cha_clprintf(FILE *output, char *format, ...) { char tmpbuf[CHA_INPUT_SIZE]; va_list ap; --- 52,95 ---- static int path_buffer[CHA_INPUT_SIZE]; static int is_bol = 1; static int pos_end = 0; ! static void (*cha_putc) (), (*cha_puts) (), (*cha_printf) (); ! static void (*cha_fputc) (), (*cha_fputs) (), (*cha_fprintf) (); ! void ! cha_print_reset(void) { pos_end = 0; } /* * cha_clputc, cha_clputs, cha_clprintf ! * - output functions for ChaSen client */ ! static void ! cha_clputc(int c, FILE * output) { if (is_bol && c == '.') ! putc('.', output); putc(c, output); is_bol = c == '\n' ? 1 : 0; } ! static void ! cha_clputs(char *s, FILE * output) { if (is_bol && s[0] == '.') ! putc('.', output); fputs(s, output); ! is_bol = s[strlen(s) - 1] == '\n' ? 1 : 0; } ! static void ! cha_clprintf(FILE * output, char *format, ...) { char tmpbuf[CHA_INPUT_SIZE]; va_list ap; *************** *** 99,105 **** va_end(ap); if (is_bol && tmpbuf[0] == '.') ! putc('.', output); fputs(tmpbuf, output); --- 99,105 ---- va_end(ap); if (is_bol && tmpbuf[0] == '.') ! putc('.', output); fputs(tmpbuf, output); *************** *** 108,114 **** /* * cha_sputc, cha_sputs, cha_sprintf ! * - output fuctions to string * * NOTE: `output' is a dummy argument for compatibility with cha_clputc, etc. * --- 108,114 ---- /* * cha_sputc, cha_sputs, cha_sprintf ! * - output fuctions to string * * NOTE: `output' is a dummy argument for compatibility with cha_clputc, etc. * *************** *** 118,128 **** static int cha_output_idx; static int cha_output_nblock; ! static void cha_sputc(int c, char *output /* dummy */) { if (cha_output_idx + 1 >= CHA_OUTPUT_SIZE * cha_output_nblock && cha_output) { ! cha_output = realloc(cha_output, CHA_OUTPUT_SIZE * ++cha_output_nblock); } if (cha_output) { --- 118,130 ---- static int cha_output_idx; static int cha_output_nblock; ! static void ! cha_sputc(int c, char *output /* dummy */ ) { if (cha_output_idx + 1 >= CHA_OUTPUT_SIZE * cha_output_nblock && cha_output) { ! cha_output = ! realloc(cha_output, CHA_OUTPUT_SIZE * ++cha_output_nblock); } if (cha_output) { *************** *** 131,143 **** } } ! static void cha_sputs(char *s, char *output) { int len = strlen(s); if (cha_output_idx + len >= CHA_OUTPUT_SIZE * cha_output_nblock && cha_output) { ! cha_output = realloc(cha_output, CHA_OUTPUT_SIZE * ++cha_output_nblock); } if (cha_output) { --- 133,147 ---- } } ! static void ! cha_sputs(char *s, char *output) { int len = strlen(s); if (cha_output_idx + len >= CHA_OUTPUT_SIZE * cha_output_nblock && cha_output) { ! cha_output = ! realloc(cha_output, CHA_OUTPUT_SIZE * ++cha_output_nblock); } if (cha_output) { *************** *** 146,152 **** } } ! static void cha_sprintf(char *output, char *format, ...) { char tmpbuf[CHA_INPUT_SIZE]; va_list ap; --- 150,157 ---- } } ! static void ! cha_sprintf(char *output, char *format, ...) { char tmpbuf[CHA_INPUT_SIZE]; va_list ap; *************** *** 157,212 **** cha_sputs(tmpbuf, output); } ! void cha_set_fput(int server_mode) { ! /* * For system having no prototype declarations for the following * functions such as SunOS 4.1.4. */ ! extern int fputc(int, FILE*); ! extern int fputs(const char*, FILE*); ! extern int fprintf(FILE*, const char*, ...); if (server_mode) { ! cha_fputc = (void (*))cha_clputc; ! cha_fputs = (void (*))cha_clputs; ! cha_fprintf = (void (*))cha_clprintf; } else { ! cha_fputc = (void (*))fputc; ! cha_fputs = (void (*))fputs; ! cha_fprintf = (void (*))fprintf; } } ! void cha_set_output(FILE *output) { if (output == NULL) { ! /* output to string */ ! cha_putc = (void (*))cha_sputc; ! cha_puts = (void (*))cha_sputs; ! cha_printf = (void (*))cha_sprintf; ! /* initialize output buffer */ if (cha_output_nblock > 1) { free(cha_output); cha_output_nblock = 0; } if (cha_output_nblock == 0) ! cha_output = malloc(CHA_OUTPUT_SIZE * ++cha_output_nblock); cha_output_idx = 0; cha_output[0] = '\0'; } else { ! /* output to file */ ! cha_output = (char *)output; ! cha_putc = (void (*))cha_fputc; ! cha_puts = (void (*))cha_fputs; ! cha_printf = (void (*))cha_fprintf; } } /* * returns cha_output for chasen_[fs]arse_tostr() */ ! char *cha_get_output(void) { return cha_output; } --- 162,226 ---- cha_sputs(tmpbuf, output); } ! void ! cha_set_fput(int server_mode) { ! /* * For system having no prototype declarations for the following * functions such as SunOS 4.1.4. */ ! extern int fputc(int, FILE *); ! extern int fputs(const char *, FILE *); ! extern int fprintf(FILE *, const char *, ...); if (server_mode) { ! cha_fputc = (void (*)) cha_clputc; ! cha_fputs = (void (*)) cha_clputs; ! cha_fprintf = (void (*)) cha_clprintf; } else { ! cha_fputc = (void (*)) fputc; ! cha_fputs = (void (*)) fputs; ! cha_fprintf = (void (*)) fprintf; } } ! void ! cha_set_output(FILE * output) { if (output == NULL) { ! /* ! * output to string ! */ ! cha_putc = (void (*)) cha_sputc; ! cha_puts = (void (*)) cha_sputs; ! cha_printf = (void (*)) cha_sprintf; ! /* ! * initialize output buffer ! */ if (cha_output_nblock > 1) { free(cha_output); cha_output_nblock = 0; } if (cha_output_nblock == 0) ! cha_output = malloc(CHA_OUTPUT_SIZE * ++cha_output_nblock); cha_output_idx = 0; cha_output[0] = '\0'; } else { ! /* ! * output to file ! */ ! cha_output = (char *) output; ! cha_putc = (void (*)) cha_fputc; ! cha_puts = (void (*)) cha_fputs; ! cha_printf = (void (*)) cha_fprintf; } } /* * returns cha_output for chasen_[fs]arse_tostr() */ ! char * ! cha_get_output(void) { return cha_output; } *************** *** 214,283 **** /* * cha_printf_mrph - print morpheme using format string * * format string: ! * %m 見出し(活用形) ! * %M 見出し(基本形) ! * %y 読みの第一候補(活用形) ! * %Y 読み第一候補(基本形) ! * %y0 読み全体(活用形) ! * %Y0 読み全体(基本形) ! * %a 発音の第一候補(活用形) ! * %A 発音の第一候補(基本形) ! * %a0 発音全体(活用形) ! * %A0 発音全体(基本形) * %rabc ルビつきの見出し("a見出しb読みc" と表示) ! * %i 付加情報 ! * %Ic 付加情報(空文字列か"NIL"なら文字c) ! * %Pc 各階層の品詞を文字cで区切った文字列(vgramのみ) ! * %Pnc 1〜n(n:1〜9)階層目までの品詞を文字cで区切った文字列(vgramのみ) ! * %h 品詞の番号 ! * %H 品詞(vgramの場合は1階層目) ! * %Hn n(n:1〜9)階層目の品詞(なければ最も深い階層)(vgramのみ) ! * %b 品詞細分類の番号(vgramの場合は0) ! * %BB 品詞細分類(なければ品詞) ! * %Bc 品詞細分類(なければ文字c) ! * %t 活用型の番号 ! * %Tc 活用型(なければ文字c) ! * %f 活用形の番号 ! * %Fc 活用形(なければ文字c) ! * %c 形態素のコスト ! * %S 解析文全体 ! * %pb 最適パスであれば "*", そうでなければ " " ! * %pi パスの番号 ! * %ps パスの形態素の開始位置 ! * %pe パスの形態素の終了位置+1 ! * %pc パスのコスト ! * %ppiC 前に接続するパスの番号を文字Cで区切り列挙 ! * %ppcC 前に接続するパスのコストを文字Cで区切り列挙 * %rABC,%Ic,%Bc,%Tc,%Fc については A,B,C,c が空白文字の時は何も ! * 表示しない * ! * %?B/STR1/STR2/ 品詞細分類があればSTR1、なければSTR2 ! * %?I/STR1/STR2/ 付加情報がNILでも""でもなければSTR1、そうでなければSTR2 ! * %?T/STR1/STR2/ 活用があればSTR1、なければSTR2 ! * %?F/STR1/STR2/ 活用があればSTR1、なければSTR2 ! * %?U/STR1/STR2/ 未定義語ならSTR1、そうでなければSTR2 ! * %U/STR/ 未定義語なら"未定義語"(vgramの場合は"未知語")、 ! * そうでなければSTR(%?U/未知語/STR/ と同じ) ! * `/'には任意の文字が使える。 ! * また、括弧「(){}[]<>」を使った以下のような形式が使える。 ! * %?B(STR1)(STR2) %?B{STR1}/STR2/ %?U[STR] * ! * %% % そのもの ! * . フィールド幅を指定 ! * - フィールド幅を指定 ! * 1-9 フィールド幅を指定 ! * \n 改行文字 ! * \t タブ ! * \\ \ そのもの ! * \' ' そのもの ! * \" " そのもの * * example: ! * "%m %y %M %h %b %t %f\n" same as -c option ! * "%m %U(%y) %M %H %h %B* %b %T* %t %F* %f\n" same as -e option */ ! static int check_con_cost(path_t *path, int con_tbl) { int con_cost; --- 228,300 ---- /* * cha_printf_mrph - print morpheme using format string * + * about the format of English please see `manual.tex' + * * format string: ! * %m 見出し(活用形) ! * %M 見出し(基本形) ! * %y 読みの第一候補(活用形) ! * %Y 読み第一候補(基本形) ! * %y0 読み全体(活用形) ! * %Y0 読み全体(基本形) ! * %a 発音の第一候補(活用形) ! * %A 発音の第一候補(基本形) ! * %a0 発音全体(活用形) ! * %A0 発音全体(基本形) * %rabc ルビつきの見出し("a見出しb読みc" と表示) ! * %i 付加情報 ! * %Ic 付加情報(空文字列か"NIL"なら文字c) ! * %Pc 各階層の品詞を文字cで区切った文字列(vgramのみ) ! * %Pnc 1〜n(n:1〜9)階層目までの品詞を文字cで区切った文字列(vgramのみ) ! * %h 品詞の番号 ! * %H 品詞(vgramの場合は1階層目) ! * %Hn n(n:1〜9)階層目の品詞(なければ最も深い階層)(vgramのみ) ! * %b 品詞細分類の番号(vgramの場合は0) ! * %BB 品詞細分類(なければ品詞) ! * %Bc 品詞細分類(なければ文字c) ! * %t 活用型の番号 ! * %Tc 活用型(なければ文字c) ! * %f 活用形の番号 ! * %Fc 活用形(なければ文字c) ! * %c 形態素のコスト ! * %S 解析文全体 ! * %pb 最適パスであれば "*", そうでなければ " " ! * %pi パスの番号 ! * %ps パスの形態素の開始位置 ! * %pe パスの形態素の終了位置+1 ! * %pc パスのコスト ! * %ppiC 前に接続するパスの番号を文字Cで区切り列挙 ! * %ppcC 前に接続するパスのコストを文字Cで区切り列挙 * %rABC,%Ic,%Bc,%Tc,%Fc については A,B,C,c が空白文字の時は何も ! * 表示しない * ! * %?B/STR1/STR2/ 品詞細分類があればSTR1、なければSTR2 ! * %?I/STR1/STR2/ 付加情報がNILでも""でもなければSTR1、そうでなければSTR2 ! * %?T/STR1/STR2/ 活用があればSTR1、なければSTR2 ! * %?F/STR1/STR2/ 活用があればSTR1、なければSTR2 ! * %?U/STR1/STR2/ 未定義語ならSTR1、そうでなければSTR2 ! * %U/STR/ 未定義語なら"未定義語"(vgramの場合は"未知語")、 ! * そうでなければSTR(%?U/未知語/STR/ と同じ) ! * `/'には任意の文字が使える。 ! * また、括弧「(){}[]<>」を使った以下のような形式が使える。 ! * %?B(STR1)(STR2) %?B{STR1}/STR2/ %?U[STR] * ! * %% % そのもの ! * . フィールド幅を指定 ! * - フィールド幅を指定 ! * 1-9 フィールド幅を指定 ! * \n 改行文字 ! * \t タブ ! * \\ \ そのもの ! * \' ' そのもの ! * \" " そのもの * * example: ! * "%m %y %M %h %b %t %f\n" same as -c option ! * "%m %U(%y) %M %H %h %B* %b %T* %t %F* %f\n" same as -e option */ ! static int ! check_con_cost(path_t * path, int con_tbl) { int con_cost; *************** *** 287,305 **** return con_cost; } ! static int comm_prefix_len(char *s1, char *s2) { char *s0 = s1; for (; *s1 && *s1 == *s2; s1++, s2++) { ! if ((unsigned char)*s1 & 0x80) ! if (*++s1 != *++s2) ! break; } return s1 - s0; } ! static void set_ruby(char *dest, char *midasi, char *yomi, ! int par1, int par2, int par3) { char *d = dest; char *m = midasi; --- 304,324 ---- return con_cost; } ! static int ! comm_prefix_len(char *s1, char *s2) { char *s0 = s1; for (; *s1 && *s1 == *s2; s1++, s2++) { ! if ((unsigned char) *s1 & 0x80) ! if (*++s1 != *++s2) ! break; } return s1 - s0; } ! static void ! set_ruby(char *dest, char *midasi, char *yomi, ! int par1, int par2, int par3) { char *d = dest; char *m = midasi; *************** *** 311,317 **** int plen, maxplen = 0; for (;;) { ! for (; *y; y += ((unsigned char)*y & 0x80) ? 2 : 1) { if (stat == 0) { stat = 1; if ((plen = comm_prefix_len(m, y)) > 0) { --- 330,336 ---- int plen, maxplen = 0; for (;;) { ! for (; *y; y += ((unsigned char) *y & 0x80) ? 2 : 1) { if (stat == 0) { stat = 1; if ((plen = comm_prefix_len(m, y)) > 0) { *************** *** 323,330 **** m0 = m; y0 = y; if (!*m || !*y) ! goto end_ruby; ! m += ((unsigned char)*m & 0x80) ? 2 : 1; plen = maxplen = 0; continue; } --- 342,349 ---- m0 = m; y0 = y; if (!*m || !*y) ! goto end_ruby; ! m += ((unsigned char) *m & 0x80) ? 2 : 1; plen = maxplen = 0; continue; } *************** *** 335,398 **** } if (maxplen == 0) { if (*m) ! m += ((unsigned char)*m & 0x80) ? 2 : 1; if (!*m) ! ymax = y; ! } if (!*m || maxplen > 0) { y = ymax; ! if (par1 != ' ' ) *d++ = par1; ! memcpy(d, m0, m-m0); d += m-m0; ! if (par2 != ' ' ) *d++ = par2; ! memcpy(d, y0, y-y0); d += y-y0; ! if (par3 != ' ' ) *d++ = par3; if (!*m) ! break; ! stat = 0; } } end_ruby: *d = '\0'; } ! static void print_nhinsi(int hinsi, int c, int n) { short *path; int i; if (c == '\'') ! cha_putc(c, cha_output); path = Cha_hinsi[hinsi].path; ! for (i = 0; ; i++) { cha_puts(Cha_hinsi[*path].name, cha_output); if (!*path || !*++path || i == n) ! break; if (c == '\'') ! cha_puts("'-'", cha_output); else ! cha_putc(c, cha_output); } if (c == '\'') ! cha_putc(c, cha_output); } ! /*********************************************************************** * int_to_str - convert an integer to ASCII * by Masanao Izumo ! ***********************************************************************/ ! static char *int_to_str(int value) { static char buff[32]; ! char* p; int sign; p = buff + 31; if (value >= 0) ! sign = 0; else { ! if (-value == value) { /* value == INT_MIN */ sprintf(buff, "%d", value); return buff; } --- 354,424 ---- } if (maxplen == 0) { if (*m) ! m += ((unsigned char) *m & 0x80) ? 2 : 1; if (!*m) ! ymax = y; ! } if (!*m || maxplen > 0) { y = ymax; ! if (par1 != ' ') ! *d++ = par1; ! memcpy(d, m0, m - m0); ! d += m - m0; ! if (par2 != ' ') ! *d++ = par2; ! memcpy(d, y0, y - y0); ! d += y - y0; ! if (par3 != ' ') ! *d++ = par3; if (!*m) ! break; ! stat = 0; } } end_ruby: *d = '\0'; } ! static void ! print_nhinsi(int hinsi, int c, int n) { short *path; int i; if (c == '\'') ! cha_putc(c, cha_output); path = Cha_hinsi[hinsi].path; ! for (i = 0;; i++) { cha_puts(Cha_hinsi[*path].name, cha_output); if (!*path || !*++path || i == n) ! break; if (c == '\'') ! cha_puts("'-'", cha_output); else ! cha_putc(c, cha_output); } if (c == '\'') ! cha_putc(c, cha_output); } ! /* * int_to_str - convert an integer to ASCII * by Masanao Izumo ! */ ! static char * ! int_to_str(int value) { static char buff[32]; ! char *p; int sign; p = buff + 31; if (value >= 0) ! sign = 0; else { ! if (-value == value) { /* value == INT_MIN */ sprintf(buff, "%d", value); return buff; } *************** *** 404,420 **** *--p = value % 10 + '0'; value /= 10; } while (value > 0); ! if(sign) ! *--p = '-'; return p; ! } ! /*********************************************************************** * fputsn * by Masanao Izumo ! ***********************************************************************/ ! static void fputsn(char *str, char *out, int n) { char buff[256]; int len; --- 430,447 ---- *--p = value % 10 + '0'; value /= 10; } while (value > 0); ! if (sign) ! *--p = '-'; return p; ! } ! /* * fputsn * by Masanao Izumo ! */ ! static void ! fputsn(char *str, char *out, int n) { char buff[256]; int len; *************** *** 429,442 **** } } ! /*********************************************************************** * printf_field * by Masanao Izumo ! ***********************************************************************/ ! static void printf_field(char *width_str, char *word) { char *field = width_str; ! int field_len, word_len; if (width_str == NULL) { cha_puts(word, cha_output); --- 456,476 ---- } } ! /* ad-hoc macros XXX */ ! #define strtoi(s, i) \ ! while (isdigit(*(s))) { (i) = (i) * 10 + *(s) - '0'; (s)++; } ! ! #define field_putsn(w, o, l) \ ! ((l) == -1) ? cha_puts((w), (o)) : fputsn((w), (o), (l)) ! /* * printf_field * by Masanao Izumo ! */ ! static void ! printf_field(char *width_str, char *word) { char *field = width_str; ! int field_len, word_len, wl; if (width_str == NULL) { cha_puts(word, cha_output); *************** *** 444,517 **** } if (*field == '-') ! field++; word_len = -1; field_len = 0; ! while('0' <= *field && *field <= '9') { ! field_len = field_len * 10 + *field - '0'; ! field++; ! } if (*field == '.') { int len = 0; word_len = strlen(word); field++; ! while ('0' <= *field && *field <= '9') { ! len = len * 10 + *field - '0'; ! field++; ! } if (len < word_len) ! word_len = len; } ! if (word_len == -1) { ! word_len = strlen(word); ! if (*width_str == '-') { ! cha_puts(word, cha_output); ! field_len -= word_len; ! while(field_len-- > 0) ! cha_putc(' ', cha_output); ! } else { ! field_len -= word_len; ! while(field_len-- > 0) ! cha_putc(' ', cha_output); ! cha_puts(word, cha_output); ! } } else { ! if (*width_str == '-') { ! fputsn(word, cha_output, word_len); ! field_len -= word_len; ! while(field_len-- > 0) ! cha_putc(' ', cha_output); ! } else { ! field_len -= word_len; ! while(field_len-- > 0) ! cha_putc(' ', cha_output); ! fputsn(word, cha_output, word_len); ! } } } ! static int get_deli_right(int c) { switch (c) { ! case '(': return ')'; ! case '{': return '}'; ! case '[': return ']'; ! case '<': return '>'; ! default: return c; } } ! static void print_anno(int path_num, char *format) { path_t *path = &Cha_path[path_num]; mrph2_t mrph; int start, end; ! if (!Cha_anno_info[0].hinsi && !Cha_anno_info[1].hinsi && !Cha_anno_info[1].format) ! return; if (path->start <= pos_end) { pos_end = path->end; --- 478,541 ---- } if (*field == '-') ! field++; word_len = -1; field_len = 0; ! strtoi(field, field_len); if (*field == '.') { int len = 0; word_len = strlen(word); field++; ! strtoi(field, len); if (len < word_len) ! word_len = len; } ! wl = (word_len == -1) ? strlen(word) : word_len; ! if (*width_str == '-') { ! field_putsn(word, cha_output, word_len); ! field_len -= wl; ! while (field_len-- > 0) ! cha_putc(' ', cha_output); } else { ! field_len -= wl; ! while (field_len-- > 0) ! cha_putc(' ', cha_output); ! field_putsn(word, cha_output, word_len); } } + #undef strtoi(s, i) + #undef field_putsn((w), (o), (l)) ! static int ! get_deli_right(int c) { switch (c) { ! case '(': ! return ')'; ! case '{': ! return '}'; ! case '[': ! return ']'; ! case '<': ! return '>'; ! default: ! return c; } } ! static void ! print_anno(int path_num, char *format) { path_t *path = &Cha_path[path_num]; mrph2_t mrph; int start, end; ! if (!Cha_anno_info[0].hinsi && !Cha_anno_info[1].hinsi ! && !Cha_anno_info[1].format) ! return; if (path->start <= pos_end) { pos_end = path->end; *************** *** 522,533 **** end = path->end; while (start > pos_end) { ! int anno_no = -char_type[pos_end]; char *format_string = format; ! if (anno_no >=0 && ! (Cha_anno_info[anno_no].hinsi || Cha_anno_info[anno_no].format)) { ! mrph.midasi = sentence + pos_end; ! mrph.base_length = mrph.length = undefword_len[pos_end]; mrph.yomi = ""; mrph.base = ""; mrph.pron = ""; --- 546,559 ---- end = path->end; while (start > pos_end) { ! int anno_no = cha_tok_anno_type(Cha_tokenizer, pos_end); char *format_string = format; ! if (anno_no >= 0 && ! (Cha_anno_info[anno_no].hinsi ! || Cha_anno_info[anno_no].format)) { ! mrph.midasi = Cha_tokenizer->string + pos_end; ! mrph.base_length = ! mrph.length = cha_tok_char_type_len(Cha_tokenizer, pos_end); mrph.yomi = ""; mrph.base = ""; mrph.pron = ""; *************** *** 548,619 **** path->end = pos_end + mrph.length; cha_printf_mrph(path_num, &mrph, format_string); } ! pos_end += undefword_len[pos_end]; } path->end = pos_end = end; path->start = start; } ! static void extract_yomi1(char *dst, char *src) { int in_brace = 0, is1st = 0; char *s, *d; if (strchr(src, '{') == NULL) { if (dst != src) ! strcpy(dst, src); return; } for (s = src, d = dst; *s; s++) { if (!in_brace) { if (*s == '{') ! in_brace = is1st = 1; else ! *d++ = *s; ! } else if (*s == '}') ! in_brace = 0; ! else if (is1st) { ! if (*s == '/') ! is1st = 0; ! else ! *d++ = *s; ! } } *d = '\0'; } ! void cha_printf_mrph(int path_num, mrph2_t *mrph, char *format) { ! int letter, value, n, state; ! int deli_left = 0, deli_right = 0; char *s, *word, *eword; char word_str[CHA_INPUT_SIZE], word_str2[CHA_INPUT_SIZE]; char *width_str; path_t *path = &Cha_path[path_num]; ! eword = NULL; /* string in EUC */ ! word = NULL; /* string in EUC(UNIX) or SJIS(Win) */ ! letter = 0; /* character */ ! value = INT_MAX; /* integer value */ state = 0; for (s = format; *s; s++) { - #if 0 - /* escape characters */ - if (*s == '\\') { - if (s[1]) - cha_putc(*++s, cha_output); - continue; - } - #endif if (state == 1 && *s == deli_right) { if (deli_right != deli_left && !*s++) ! return; deli_right = get_deli_right(*s); if ((s = strchr(++s, deli_right)) == NULL) ! return; state = 0; continue; } --- 574,639 ---- path->end = pos_end + mrph.length; cha_printf_mrph(path_num, &mrph, format_string); } ! pos_end += cha_tok_char_type_len(Cha_tokenizer, pos_end); } path->end = pos_end = end; path->start = start; } ! static void ! extract_yomi1(char *dst, char *src) { int in_brace = 0, is1st = 0; char *s, *d; if (strchr(src, '{') == NULL) { if (dst != src) ! strcpy(dst, src); return; } for (s = src, d = dst; *s; s++) { if (!in_brace) { if (*s == '{') ! in_brace = is1st = 1; ! else ! *d++ = *s; ! } else if (*s == '}') ! in_brace = 0; ! else if (is1st) { ! if (*s == '/') ! is1st = 0; else ! *d++ = *s; ! } } *d = '\0'; } ! void ! cha_printf_mrph(int path_num, mrph2_t * mrph, char *format) { ! int letter, value, n, state; ! int deli_left = 0, deli_right = 0; char *s, *word, *eword; char word_str[CHA_INPUT_SIZE], word_str2[CHA_INPUT_SIZE]; char *width_str; path_t *path = &Cha_path[path_num]; ! eword = NULL; /* string in EUC */ ! word = NULL; /* string in EUC(UNIX) or SJIS(Win) */ ! letter = 0; /* character */ ! value = INT_MAX; /* integer value */ state = 0; for (s = format; *s; s++) { if (state == 1 && *s == deli_right) { if (deli_right != deli_left && !*s++) ! return; deli_right = get_deli_right(*s); if ((s = strchr(++s, deli_right)) == NULL) ! return; state = 0; continue; } *************** *** 632,745 **** if (*s == '-' || *s == '.' || (*s >= '0' && *s <= '9')) { width_str = s; while (*s == '-' || *s == '.' || (*s >= '0' && *s <= '9')) ! s++; } switch (*s) { ! case '?': if (!*++s) ! return; state = 2; switch (*s) { ! case 'U': if (mrph->is_undef) ! state = 1; break; ! case 'B': if (Cha_hinsi[mrph->hinsi].depth > 1) ! state = 1; break; ! case 'I': if (mrph->info[0] && strcmp(mrph->info, "NIL")) ! state = 1; break; ! case 'T': ! case 'F': if (mrph->kform) ! state = 1; break; } if (!*++s) ! return; deli_right = get_deli_right(deli_left = *s); if (state == 2) { if ((s = strchr(++s, deli_right)) == NULL) - return; - if (deli_left != deli_right) - if (!*++s) return; deli_right = get_deli_right(*s); } continue; ! case 'U': if (mrph->is_undef) { ! state = 1; deli_right = *s; deli_left = '\0'; s--; ! word = Cha_lang_e ? ESTR_UNKNOWN_WORD : JSTR_UNKNOWN_WORD; } else { ! state = 2; deli_right = get_deli_right(deli_left = *++s); } break; ! case 'm': /* 見出し(活用形) */ ! /* bunsetsu */ ! if (mrph->length == 0) ! word = mrph->midasi; ! else { memcpy(eword = word_str, mrph->midasi, mrph->length); word_str[mrph->length] = '\0'; } break; ! case 'M': /* 見出し(基本形) */ ! /* bunsetsu */ ! if (mrph->length == 0) ! word = mrph->midasi; ! else if (mrph->base[0]) ! eword = mrph->base; else { memcpy(eword = word_str, mrph->midasi, mrph->base_length); if (!mrph->ktype) ! word_str[mrph->base_length] = '\0'; else ! strcpy(word_str + mrph->base_length, ! Cha_form[mrph->ktype][Cha_type[mrph->ktype].basic].gobi); } break; ! case 'y': /* 読み */ ! case 'Y': /* 読み(基本形) */ ! case 'r': ! /* bunsetsu */ ! if (mrph->length == 0) ! word = mrph->midasi; ! else { if (mrph->yomi[0]) { if (s[0] != 'r' && s[1] != '0') ! extract_yomi1(word_str, mrph->yomi); else ! strcpy(word_str, mrph->yomi); } else { memcpy(word_str, mrph->midasi, mrph->base_length); word_str[mrph->base_length] = '\0'; } if (mrph->ktype > 0) { if (*s != 'Y') ! strcat(word_str, Cha_form[mrph->ktype][mrph->kform].ygobi); else ! strcat(word_str, ! Cha_form[mrph->ktype][Cha_type[mrph->ktype].basic].ygobi); } eword = word_str; } if (*s != 'r') { if (s[1] == '0' || s[1] == '1') ! s++; break; } - #if 1 if (!s[1] || !s[2] || !s[3]) ! cha_putc(*s, cha_output); else { extract_yomi1(word_str2, eword); eword = word_str2; --- 652,765 ---- if (*s == '-' || *s == '.' || (*s >= '0' && *s <= '9')) { width_str = s; while (*s == '-' || *s == '.' || (*s >= '0' && *s <= '9')) ! s++; } switch (*s) { ! case '?': if (!*++s) ! return; state = 2; switch (*s) { ! case 'U': if (mrph->is_undef) ! state = 1; break; ! case 'B': if (Cha_hinsi[mrph->hinsi].depth > 1) ! state = 1; break; ! case 'I': if (mrph->info[0] && strcmp(mrph->info, "NIL")) ! state = 1; break; ! case 'T': ! case 'F': if (mrph->kform) ! state = 1; break; } if (!*++s) ! return; deli_right = get_deli_right(deli_left = *s); if (state == 2) { if ((s = strchr(++s, deli_right)) == NULL) return; + if (deli_left != deli_right) + if (!*++s) + return; deli_right = get_deli_right(*s); } continue; ! case 'U': if (mrph->is_undef) { ! state = 1; deli_right = *s; deli_left = '\0'; s--; ! word = (Cha_lang == CHASEN_LANG_EN) ? ! ESTR_UNKNOWN_WORD : JSTR_UNKNOWN_WORD; } else { ! state = 2; deli_right = get_deli_right(deli_left = *++s); } break; ! case 'm': /* Surface string (surface form) */ ! if (mrph->length == 0) /* bunsetsu */ ! word = mrph->midasi; ! else { /* not bunsetsu */ memcpy(eword = word_str, mrph->midasi, mrph->length); word_str[mrph->length] = '\0'; } break; ! case 'M': /* Surface string (base form) */ ! if (mrph->length == 0) /* bunsetsu */ ! word = mrph->midasi; ! else if (mrph->base[0]) /* not bunsetsu */ ! eword = mrph->base; else { memcpy(eword = word_str, mrph->midasi, mrph->base_length); if (!mrph->ktype) ! word_str[mrph->base_length] = '\0'; else ! strcpy(word_str + mrph->base_length, ! Cha_form[mrph->ktype][Cha_type[mrph->ktype]. ! basic].gobi); } break; ! case 'y': /* Japanese Reading (surface form) */ ! case 'Y': /* Japanese Reading (base form) */ ! case 'r': ! if (mrph->length == 0) /* bunsetsu */ ! word = mrph->midasi; ! else { /* not bunsetsu */ if (mrph->yomi[0]) { if (s[0] != 'r' && s[1] != '0') ! extract_yomi1(word_str, mrph->yomi); else ! strcpy(word_str, mrph->yomi); } else { memcpy(word_str, mrph->midasi, mrph->base_length); word_str[mrph->base_length] = '\0'; } if (mrph->ktype > 0) { if (*s != 'Y') ! strcat(word_str, ! Cha_form[mrph->ktype][mrph->kform].ygobi); else ! strcat(word_str, ! Cha_form[mrph->ktype][Cha_type[mrph->ktype]. ! basic].ygobi); } eword = word_str; } if (*s != 'r') { if (s[1] == '0' || s[1] == '1') ! s++; break; } if (!s[1] || !s[2] || !s[3]) ! cha_putc(*s, cha_output); else { extract_yomi1(word_str2, eword); eword = word_str2; *************** *** 754,906 **** s += 3; } break; ! #endif ! case 'a': /* 発音 */ ! case 'A': /* 発音(基本形) */ ! /* bunsetsu */ ! if (mrph->length == 0) ! word = mrph->midasi; ! else { if (mrph->pron[0]) { if (s[1] != '0') ! extract_yomi1(word_str, mrph->pron); else ! strcpy(word_str, mrph->pron); } else if (mrph->yomi[0]) { if (s[1] != '0') ! extract_yomi1(word_str, mrph->yomi); else ! strcpy(word_str, mrph->yomi); } else { memcpy(word_str, mrph->midasi, mrph->base_length); word_str[mrph->base_length] = 0; } if (mrph->ktype > 0) { if (*s != 'A') ! strcat(word_str, Cha_form[mrph->ktype][mrph->kform].pgobi); else ! strcat(word_str, ! Cha_form[mrph->ktype][Cha_type[mrph->ktype].basic].pgobi); } eword = word_str; } if (s[1] == '0' || s[1] == '1') ! s++; break; ! case 'i': /* 付加情報 */ if (s[1] != '0') ! extract_yomi1(word_str, mrph->info); else ! strcpy(word_str, mrph->info); eword = word_str; break; ! case 'I': /* 付加情報 */ if (*++s == '\0') ! cha_putc(*--s, cha_output); else if (mrph->info[0] && strcmp(mrph->info, "NIL")) ! eword = mrph->info; else if (*s != ' ') ! letter = *s; break; ! case 'P': ! n = 99; /* すべての階層を表示 */ if (s[1] >= '1' && s[1] <= '9') ! n = *++s - '1'; if (s[1] == '\0') ! cha_putc(*s, cha_output); else ! print_nhinsi(mrph->hinsi, *++s, n); break; ! case 'h': /* 品詞(番号) */ value = mrph->hinsi; break; ! case 'H': /* 品詞(文字列) */ if (s[1] < '1' || s[1] > '9') ! n = 0; else { n = *++s - '1'; if (Cha_hinsi[mrph->hinsi].depth - 1 < n) ! n = Cha_hinsi[mrph->hinsi].depth - 1; } word = Cha_hinsi[Cha_hinsi[mrph->hinsi].path[n]].name; break; ! case 'b': /* 品詞細分類(番号) */ value = 0; break; ! case 'B': /* 品詞細分類(文字列) */ if (s[1] == '\0') ! cha_putc(*s, cha_output); else if (*++s == 'M' && mrph->is_undef) ! word = Cha_lang_e ? ESTR_UNKNOWN_WORD : JSTR_UNKNOWN_WORD; ! /* 階層化品詞なら一番下の階層の品詞名を表示 */ ! else if (*s == 'M' || *s == 'B' || Cha_hinsi[mrph->hinsi].depth > 1) ! word = Cha_hinsi[mrph->hinsi].name; else if (*s != ' ') ! letter = *s; break; ! case 't': /* 活用型(番号) */ value = mrph->ktype; break; ! case 'T': /* 活用型(文字列) */ if (*++s == '\0') ! cha_putc(*--s, cha_output); else if (mrph->ktype) ! word = Cha_type[mrph->ktype].name; else if (*s != ' ') ! letter = *s; break; ! case 'f': /* 活用形(番号) */ value = mrph->kform; break; ! case 'F': /* 活用形(文字列) */ if (*++s == '\0') ! cha_putc(*--s, cha_output); if (mrph->kform) ! word = Cha_form[mrph->ktype][mrph->kform].name; else if (*s != ' ') ! letter = *s; break; ! case 'c': /* 形態素のコスト */ if (mrph->is_undef) { ! value = Cha_undef_info[mrph->is_undef-1].cost ! + Cha_undef_info[mrph->is_undef-1].cost_step * mrph->length / 2; } else { value = Cha_hinsi[mrph->hinsi].cost; } value *= mrph->weight * Cha_mrph_cost_weight; break; ! case 'S': /* 解析文全体 */ ! word = sentence; break; ! case 'p': /* path に関する情報 */ if (s[1] == '\0') { cha_putc(*s, cha_output); break; } switch (*++s) { ! case 'i': value = path_num; break; ! case 's': value = path->start; break; ! case 'e': value = path->end; break; ! case 'c': value = path->cost; break; ! case 'b': letter = path->do_print == 2 ? '*' : ' '; break; ! case 'p': if ((s[1] != 'i' && s[1] != 'c') || s[2] == '\0') ! cha_putc(*s, cha_output); else if (*++s == 'i') { int c = *++s, j; for (j = 0; path->path[j] != -1; j++) { if (j) ! cha_putc(c, cha_output); cha_printf(cha_output, "%d", path->path[j]); } } else { --- 774,933 ---- s += 3; } break; ! case 'a': /* Japanese pronunciation (surface form) */ ! case 'A': /* Japanese pronunciation (base form) */ ! if (mrph->length == 0) /* bunsetsu */ ! word = mrph->midasi; ! else { /* not bunsetsu */ if (mrph->pron[0]) { if (s[1] != '0') ! extract_yomi1(word_str, mrph->pron); else ! strcpy(word_str, mrph->pron); } else if (mrph->yomi[0]) { if (s[1] != '0') ! extract_yomi1(word_str, mrph->yomi); else ! strcpy(word_str, mrph->yomi); } else { memcpy(word_str, mrph->midasi, mrph->base_length); word_str[mrph->base_length] = 0; } if (mrph->ktype > 0) { if (*s != 'A') ! strcat(word_str, ! Cha_form[mrph->ktype][mrph->kform].pgobi); else ! strcat(word_str, ! Cha_form[mrph->ktype][Cha_type[mrph->ktype]. ! basic].pgobi); } eword = word_str; } if (s[1] == '0' || s[1] == '1') ! s++; break; ! case 'i': /* information */ if (s[1] != '0') ! extract_yomi1(word_str, mrph->info); else ! strcpy(word_str, mrph->info); eword = word_str; break; ! case 'I': /* information */ if (*++s == '\0') ! cha_putc(*--s, cha_output); else if (mrph->info[0] && strcmp(mrph->info, "NIL")) ! eword = mrph->info; else if (*s != ' ') ! letter = *s; break; ! case 'P': ! n = 99; /* print all level of the POS -- すべての階層を表示 */ if (s[1] >= '1' && s[1] <= '9') ! n = *++s - '1'; if (s[1] == '\0') ! cha_putc(*s, cha_output); else ! print_nhinsi(mrph->hinsi, *++s, n); break; ! case 'h': /* POS number */ value = mrph->hinsi; break; ! case 'H': /* POS string */ if (s[1] < '1' || s[1] > '9') ! n = 0; else { n = *++s - '1'; if (Cha_hinsi[mrph->hinsi].depth - 1 < n) ! n = Cha_hinsi[mrph->hinsi].depth - 1; } word = Cha_hinsi[Cha_hinsi[mrph->hinsi].path[n]].name; break; ! case 'b': /* POS subdivision number */ value = 0; break; ! case 'B': /* POS subdivision string */ if (s[1] == '\0') ! cha_putc(*s, cha_output); else if (*++s == 'M' && mrph->is_undef) ! word = (Cha_lang == CHASEN_LANG_EN) ? ! ESTR_UNKNOWN_WORD : JSTR_UNKNOWN_WORD; ! /* ! * 階層化品詞なら一番下の階層の品詞名を表示 ! * when the POS has subdivision level, ! * print the lowest level of the POS name ! */ ! else if (*s == 'M' || *s == 'B' ! || Cha_hinsi[mrph->hinsi].depth > 1) ! word = Cha_hinsi[mrph->hinsi].name; else if (*s != ' ') ! letter = *s; break; ! case 't': /* Conjugation type number */ value = mrph->ktype; break; ! case 'T': /* Conjugation type string */ if (*++s == '\0') ! cha_putc(*--s, cha_output); else if (mrph->ktype) ! word = Cha_type[mrph->ktype].name; else if (*s != ' ') ! letter = *s; break; ! case 'f': /* Conjugation form number */ value = mrph->kform; break; ! case 'F': /* Conjugation form string */ if (*++s == '\0') ! cha_putc(*--s, cha_output); if (mrph->kform) ! word = Cha_form[mrph->ktype][mrph->kform].name; else if (*s != ' ') ! letter = *s; break; ! case 'c': /* the cost of morpheme */ if (mrph->is_undef) { ! value = Cha_undef_info[mrph->is_undef - 1].cost ! + Cha_undef_info[mrph->is_undef - ! 1].cost_step * mrph->length / 2; } else { value = Cha_hinsi[mrph->hinsi].cost; } value *= mrph->weight * Cha_mrph_cost_weight; break; ! case 'S': /* entire sentence */ ! word = Cha_tokenizer->string; break; ! case 'p': /* the information about path */ if (s[1] == '\0') { cha_putc(*s, cha_output); break; } switch (*++s) { ! case 'i': value = path_num; break; ! case 's': value = path->start; break; ! case 'e': value = path->end; break; ! case 'c': value = path->cost; break; ! case 'b': letter = path->do_print == 2 ? '*' : ' '; break; ! case 'p': if ((s[1] != 'i' && s[1] != 'c') || s[2] == '\0') ! cha_putc(*s, cha_output); else if (*++s == 'i') { int c = *++s, j; for (j = 0; path->path[j] != -1; j++) { if (j) ! cha_putc(c, cha_output); cha_printf(cha_output, "%d", path->path[j]); } } else { *************** *** 908,924 **** int c = *++s, j; for (j = 0; path->path[j] != -1; j++) { if (j) ! cha_putc(c, cha_output); cha_printf(cha_output, "%d", Cha_con_cost_weight * ! check_con_cost(&Cha_path[path->path[j]], con_tbl)); } ! } break; } break; ! case '\0': return; ! default: /* includes '%' */ cha_putc(*s, cha_output); continue; } --- 935,952 ---- int c = *++s, j; for (j = 0; path->path[j] != -1; j++) { if (j) ! cha_putc(c, cha_output); cha_printf(cha_output, "%d", Cha_con_cost_weight * ! check_con_cost(&Cha_path[path->path[j]], ! con_tbl)); } ! } break; } break; ! case '\0': return; ! default: /* includes '%' */ cha_putc(*s, cha_output); continue; } *************** *** 945,1003 **** value = INT_MAX; } } - - return; - - #if 0 - error_end: - cha_exit(1, "format error near the point %d in option -- F", s - format + 1); - #endif } ! static void print_bos_eos(char *str) { char *s; for (s = str; *s; s++) { if (*s == '%' && *++s == 'S') ! cha_puts(sentence, cha_output); else ! cha_putc(*s, cha_output); } ! } ! static void print_bos(int opt_form) { if (opt_form != 'W' && opt_form != 'd' && *Cha_bos_string) ! print_bos_eos(Cha_bos_string); } ! static void print_eos(int opt_form) { if (opt_form == 'W') ! cha_putc('\n', cha_output); else if (opt_form != 'd' && *Cha_eos_string) ! print_bos_eos(Cha_eos_string); } /* * print_path_mrph */ ! static void print_mrph(int path_num, mrph2_t *mrph, char *format) { print_anno(path_num, format); ! if (Cha_output_compo || *mrph->comp == '\n') ! cha_printf_mrph(path_num, mrph, format); ! else { ! /* compound word */ int kform = mrph->kform; ! while (*mrph->comp != '\n') { ! cha_get_mrph_data(mrph, mrph->comp, mrph->midasi); ! /* 最後の形態素の活用形=複合語の活用形 */ ! if (*mrph->comp == '\n' && !mrph->kform) ! mrph->kform = kform; if (mrph->ktype) { ! mrph->length += strlen(Cha_form[mrph->ktype][mrph->kform].gobi); mrph->con_tbl += mrph->kform - 1; } cha_printf_mrph(path_num, mrph, format); --- 973,1036 ---- value = INT_MAX; } } } ! static void ! print_bos_eos(char *str) { char *s; for (s = str; *s; s++) { if (*s == '%' && *++s == 'S') ! cha_puts(Cha_tokenizer->string, cha_output); else ! cha_putc(*s, cha_output); } ! } ! static void ! print_bos(int opt_form) { if (opt_form != 'W' && opt_form != 'd' && *Cha_bos_string) ! print_bos_eos(Cha_bos_string); } ! static void ! print_eos(int opt_form) { if (opt_form == 'W') ! cha_putc('\n', cha_output); else if (opt_form != 'd' && *Cha_eos_string) ! print_bos_eos(Cha_eos_string); } /* * print_path_mrph */ ! static void ! print_mrph(int path_num, mrph2_t * mrph, char *format) { print_anno(path_num, format); ! ! if (Cha_output_iscompound || ! mrph->compound == NULL || ! *mrph->compound == '\n') { ! cha_printf_mrph(path_num, mrph, format); ! } else { ! /* ! * compound word ! */ int kform = mrph->kform; ! while (*mrph->compound != '\n') { ! cha_get_mrph_data(mrph, mrph->compound, mrph->midasi); ! /* ! * 最後の形態素の活用形=複合語の活用形 ! */ ! if (*mrph->compound == '\n' && !mrph->kform) ! mrph->kform = kform; if (mrph->ktype) { ! mrph->length += ! strlen(Cha_form[mrph->ktype][mrph->kform].gobi); mrph->con_tbl += mrph->kform - 1; } cha_printf_mrph(path_num, mrph, format); *************** *** 1006,1160 **** } } ! static void print_path_mrph(int path_num, char *format) { print_mrph(path_num, &Cha_mrph[Cha_path[path_num].mrph_p], format); } /* * print_best_path() */ ! static void print_best_path(int opt_form, char *format) { int i, last, pbuf_last, isfirst = 1; ! int path_num_comp = 0; char yomi[CHA_INPUT_SIZE]; char pron[CHA_INPUT_SIZE]; char base[CHA_INPUT_SIZE]; ! mrph2_t mrph, *mrph1, *mrph2; print_bos(opt_form); last = Cha_path[Cha_path_num - 1].path[0]; - if (last) { - for (pbuf_last = 0; last; last = Cha_path[last].path[0], pbuf_last++) { - path_buffer[pbuf_last] = last; - #if 0 - printf("# last,path0: %d,%d\n",last,Cha_path[last].path[0]);fflush(stdout); - #endif - } ! /* 連結品詞を一単語に連結して表示 */ ! mrph.hinsi = 0; ! mrph.yomi = yomi; ! mrph.base = base; ! mrph.pron = pron; ! mrph1 = &Cha_mrph[Cha_path[path_buffer[pbuf_last - 1]].mrph_p]; ! for (i = pbuf_last - 1; i >= 0; i--) { ! mrph2 = i == 0 ? NULL : &Cha_mrph[Cha_path[path_buffer[i - 1]].mrph_p]; ! if (i > 0 && ! !mrph1->is_undef && !mrph2->is_undef && ! Cha_path[path_buffer[i]].end == Cha_path[path_buffer[i - 1]].start && ! Cha_hinsi[mrph1->hinsi].comp && ! Cha_hinsi[mrph1->hinsi].comp == Cha_hinsi[mrph2->hinsi].comp) { ! if (!mrph.hinsi) { ! mrph.hinsi = Cha_hinsi[mrph1->hinsi].comp; ! base[0] = '\0'; ! pron[0] = '\0'; ! yomi[0] = '\0'; ! mrph.midasi = mrph1->midasi; ! mrph.length = mrph.weight = 0; ! path_num_comp = path_buffer[i]; ! } ! if (mrph1->yomi[0]) ! strcat(yomi, mrph1->yomi); ! else { ! int len = strlen(yomi); ! memcpy(yomi + len, mrph1->midasi, mrph1->base_length); ! yomi[len + mrph1->base_length] = '\0'; ! } ! if (mrph1->ktype > 0) ! strcat(yomi, Cha_form[mrph1->ktype][mrph1->kform].ygobi); ! strcat(base, mrph1->base); ! if (mrph1->pron[0]) ! strcat(pron, mrph1->pron); ! else if (mrph1->yomi[0]) ! strcat(pron, mrph1->yomi); ! else { ! int len = strlen(pron); ! memcpy(pron + len, mrph1->midasi, mrph1->base_length); ! pron[len + mrph1->base_length] = '\0'; ! } ! if (mrph1->ktype > 0) ! strcat(pron, Cha_form[mrph1->ktype][mrph1->kform].pgobi); ! mrph.length += mrph1->length; ! mrph.weight += mrph1->weight; ! } else { ! if (opt_form == 'd') { ! if (isfirst) ! isfirst = 0; ! else ! cha_putc(',', cha_output); ! } ! if (!mrph.hinsi) ! print_mrph(path_buffer[i], mrph1, format); ! else { ! if (mrph1->yomi[0]) ! strcat(yomi, mrph1->yomi); ! else { ! int len = strlen(yomi); ! memcpy(yomi + len, mrph1->midasi, mrph1->base_length); ! yomi[len + mrph1->base_length] = '\0'; ! } ! strcat(base, mrph1->base); ! if (mrph1->pron[0]) ! strcat(pron, mrph1->pron); ! else if (mrph1->yomi[0]) ! strcat(pron, mrph1->yomi); ! else { ! int len = strlen(pron); ! memcpy(pron + len, mrph1->midasi, mrph1->base_length); ! pron[len + mrph1->base_length] = '\0'; ! } ! mrph.base_length = mrph.length + mrph1->base_length; ! mrph.length += mrph1->length; ! mrph.weight += mrph1->weight; ! mrph.info = mrph1->info; ! mrph.ktype = mrph1->ktype; ! mrph.kform = mrph1->kform; ! mrph.is_undef = mrph1->is_undef; ! Cha_path[path_num_comp].end = Cha_path[path_num_comp].start + mrph.length; ! print_mrph(path_num_comp, &mrph, format); ! mrph.hinsi = 0; ! } } - mrph1 = mrph2; } } ! print_anno(Cha_path_num - 1, format); ! print_eos(opt_form); } /* * print_all_mrph - 正しい解析結果に含まれる全ての形態素を表示 ! * -m, -d, -v オプションで使用 */ ! static void collect_all_mrph(int path_num) { int i, j; for (i = 0; (j = Cha_path[path_num].path[i]) && j != -1; i++) { if (!Cha_path[j].do_print) { Cha_path[j].do_print = ! (i == 0 && ! (path_num == Cha_path_num - 1 || Cha_path[path_num].do_print == 2)) ? 2 : 1; collect_all_mrph(j); } } } ! static void print_all_mrph(int opt_form, char *format) { int i; ! int isfirst = 1; /* 文頭かどうかのフラグ for -d option */ for (i = 0; i < Cha_path_num; i++) ! Cha_path[i].do_print = 0; collect_all_mrph(Cha_path_num - 1); ! /* -v のときは文頭・文末の情報も表示 */ if (opt_form == 'v') { Cha_path[0].do_print = 2; Cha_path[Cha_path_num - 1].do_print = 2; --- 1039,1247 ---- } } ! static void ! print_path_mrph(int path_num, char *format) { print_mrph(path_num, &Cha_mrph[Cha_path[path_num].mrph_p], format); } + static void + concat_composit_mrph(mrph2_t *composit_mrph, mrph2_t *cur_mrph) + { + /* + * initialization + */ + if (!composit_mrph->hinsi) { + composit_mrph->hinsi = Cha_hinsi[cur_mrph->hinsi].composit; + composit_mrph->midasi = cur_mrph->midasi; + composit_mrph->length = composit_mrph->weight = 0; + composit_mrph->yomi[0] = '\0'; + composit_mrph->pron[0] = '\0'; + composit_mrph->base[0] = '\0'; + } + /* + * Japanese Reading + */ + if (cur_mrph->yomi[0]) + strcat(composit_mrph->yomi, cur_mrph->yomi); + else { + int len = strlen(composit_mrph->yomi); + memcpy(composit_mrph->yomi + len, cur_mrph->midasi, cur_mrph->base_length); + composit_mrph->yomi[len + cur_mrph->base_length] = '\0'; + } + if (cur_mrph->ktype > 0) + strcat(composit_mrph->yomi, + Cha_form[cur_mrph->ktype][cur_mrph->kform].ygobi); + /* + * Pronunciation + */ + if (cur_mrph->pron[0]) + strcat(composit_mrph->pron, cur_mrph->pron); + else if (cur_mrph->yomi[0]) + strcat(composit_mrph->pron, cur_mrph->yomi); + else { + int len = strlen(composit_mrph->pron); + memcpy(composit_mrph->pron + len, cur_mrph->midasi, cur_mrph->base_length); + composit_mrph->pron[len + cur_mrph->base_length] = '\0'; + } + if (cur_mrph->ktype > 0) + strcat(composit_mrph->pron, + Cha_form[cur_mrph->ktype][cur_mrph->kform].pgobi); + + strcat(composit_mrph->base, cur_mrph->base); + composit_mrph->length += cur_mrph->length; + composit_mrph->weight += cur_mrph->weight; + } + + static void + concat_composit_mrph_end(mrph2_t *composit_mrph, mrph2_t *cur_mrph) { + /* + * Japanese Reading + */ + if (cur_mrph->yomi[0]) + strcat(composit_mrph->yomi, cur_mrph->yomi); + else { + int len = strlen(composit_mrph->yomi); + memcpy(composit_mrph->yomi + len, cur_mrph->midasi, + cur_mrph->base_length); + composit_mrph->yomi[len + cur_mrph->base_length] = '\0'; + } + /* + * Japanese Pronunciation + */ + if (cur_mrph->pron[0]) + strcat(composit_mrph->pron, cur_mrph->pron); + else if (cur_mrph->yomi[0]) + strcat(composit_mrph->pron, cur_mrph->yomi); + else { + int len = strlen(composit_mrph->pron); + memcpy(composit_mrph->pron + len, cur_mrph->midasi, + cur_mrph->base_length); + composit_mrph->pron[len + cur_mrph->base_length] = '\0'; + } + + strcat(composit_mrph->base, cur_mrph->base); + composit_mrph->base_length = composit_mrph->length + cur_mrph->base_length; + composit_mrph->length += cur_mrph->length; + composit_mrph->weight += cur_mrph->weight; + composit_mrph->info = cur_mrph->info; + composit_mrph->ktype = cur_mrph->ktype; + composit_mrph->kform = cur_mrph->kform; + composit_mrph->is_undef = cur_mrph->is_undef; + } + + #define print_anno_eos() \ + { print_anno(Cha_path_num - 1, format); print_eos(opt_form); } /* * print_best_path() */ ! static void ! print_best_path(int opt_form, char *format) { int i, last, pbuf_last, isfirst = 1; ! int path_num_composit = 0; char yomi[CHA_INPUT_SIZE]; char pron[CHA_INPUT_SIZE]; char base[CHA_INPUT_SIZE]; ! mrph2_t composit_mrph, *cur_mrph, *pre_mrph; print_bos(opt_form); last = Cha_path[Cha_path_num - 1].path[0]; ! if (last == 0) { ! print_anno_eos(); ! return; ! } ! for (pbuf_last = 0; last; last = Cha_path[last].path[0], pbuf_last++) { ! path_buffer[pbuf_last] = last; ! } ! ! /* ! * print composit POSs as one word ! */ ! /* initialization */ ! composit_mrph.hinsi = 0; ! composit_mrph.yomi = yomi; ! composit_mrph.pron = pron; ! composit_mrph.base = base; ! cur_mrph = &Cha_mrph[Cha_path[path_buffer[pbuf_last - 1]].mrph_p]; ! ! /* ! * chunking the composit POSs from EOS to BOS ! */ ! for (i = pbuf_last - 1; i >= 0; i--) { ! ! pre_mrph = (i == 0) ? ! NULL : &Cha_mrph[Cha_path[path_buffer[i - 1]].mrph_p]; ! ! if (i > 0 && !cur_mrph->is_undef && !pre_mrph->is_undef ! && (Cha_path[path_buffer[i]].end == Cha_path[path_buffer[i - 1]].start) ! && Cha_hinsi[cur_mrph->hinsi].composit ! && (Cha_hinsi[cur_mrph->hinsi].composit == Cha_hinsi[pre_mrph->hinsi].composit)) { ! ! if (!composit_mrph.hinsi) ! path_num_composit = path_buffer[i]; ! ! concat_composit_mrph(&composit_mrph, cur_mrph); ! ! } else { ! if (opt_form == 'd') { ! if (isfirst) ! isfirst = 0; ! else ! cha_putc(',', cha_output); ! } ! if (!composit_mrph.hinsi) ! print_mrph(path_buffer[i], cur_mrph, format); ! else { ! concat_composit_mrph_end(&composit_mrph, cur_mrph); ! Cha_path[path_num_composit].end = ! Cha_path[path_num_composit].start + composit_mrph.length; ! print_mrph(path_num_composit, &composit_mrph, format); ! composit_mrph.hinsi = 0; } } + cur_mrph = pre_mrph; } ! print_anno_eos(); } /* * print_all_mrph - 正しい解析結果に含まれる全ての形態素を表示 ! * -m, -d, -v オプションで使用 */ ! static void ! collect_all_mrph(int path_num) { int i, j; for (i = 0; (j = Cha_path[path_num].path[i]) && j != -1; i++) { if (!Cha_path[j].do_print) { Cha_path[j].do_print = ! (i == 0 && ! (path_num == Cha_path_num - 1 ! || Cha_path[path_num].do_print == 2)) ? 2 : 1; collect_all_mrph(j); } } } ! static void ! print_all_mrph(int opt_form, char *format) { int i; ! int isfirst = 1; /* 文頭かどうかのフラグ for -d option */ for (i = 0; i < Cha_path_num; i++) ! Cha_path[i].do_print = 0; collect_all_mrph(Cha_path_num - 1); ! /* ! * -v のときは文頭・文末の情報も表示 ! */ if (opt_form == 'v') { Cha_path[0].do_print = 2; Cha_path[Cha_path_num - 1].do_print = 2; *************** *** 1165,1173 **** if (Cha_path[i].do_print) { if (opt_form == 'd') { if (isfirst) ! isfirst = 0; else ! cha_putc(',', cha_output); } print_path_mrph(i, format); } --- 1252,1260 ---- if (Cha_path[i].do_print) { if (opt_form == 'd') { if (isfirst) ! isfirst = 0; else ! cha_putc(',', cha_output); } print_path_mrph(i, format); } *************** *** 1179,1185 **** /* * print_all_path() */ ! static void print_all_path_sub(int path_num, int paths, int opt_form, char *format) { int i, j; --- 1266,1273 ---- /* * print_all_path() */ ! static void ! print_all_path_sub(int path_num, int paths, int opt_form, char *format) { int i, j; *************** *** 1187,1193 **** if (Cha_path[path_num].path[0] == 0) { pos_end = 0; for (j = paths - 1; j >= 0; j--) ! print_path_mrph(path_buffer[j], format); print_anno(Cha_path_num - 1, format); cha_puts("EOP\n", cha_output); } else { --- 1275,1281 ---- if (Cha_path[path_num].path[0] == 0) { pos_end = 0; for (j = paths - 1; j >= 0; j--) ! print_path_mrph(path_buffer[j], format); print_anno(Cha_path_num - 1, format); cha_puts("EOP\n", cha_output); } else { *************** *** 1198,1233 **** } } ! static void print_all_path(int opt_form, char *format) { print_bos(opt_form); print_all_path_sub(Cha_path_num - 1, 0, opt_form, format); print_eos(opt_form); } ! void cha_print_path(int opt_show, int opt_form, char *format) { if (opt_form == 'd') ! cha_putc('[', cha_output); switch (opt_show) { ! case 'm': print_all_mrph(opt_form, format); break; ! case 'p': print_all_path(opt_form, format); break; ! default: print_best_path(opt_form, format); /* 'b' */ } if (opt_form == 'd') ! cha_puts("].\n", cha_output); } ! void cha_print_bos_eos(int opt_form) { ! cha_set_sentence("", NULL, NULL); print_bos(opt_form); print_eos(opt_form); } ! void cha_print_hinsi_table(void) { int i; --- 1286,1330 ---- } } ! static void ! print_all_path(int opt_form, char *format) { print_bos(opt_form); print_all_path_sub(Cha_path_num - 1, 0, opt_form, format); print_eos(opt_form); } ! void ! cha_print_path(int opt_show, int opt_form, char *format) { if (opt_form == 'd') ! cha_putc('[', cha_output); switch (opt_show) { ! case 'm': ! print_all_mrph(opt_form, format); ! break; ! case 'p': ! print_all_path(opt_form, format); ! break; ! default: ! print_best_path(opt_form, format); /* 'b' */ } if (opt_form == 'd') ! cha_puts("].\n", cha_output); } ! void ! cha_print_bos_eos(int opt_form) { ! pos_end = 0; print_bos(opt_form); print_eos(opt_form); } ! void ! cha_print_hinsi_table(void) { int i; *************** *** 1238,1254 **** } } ! void cha_print_ctype_table(void) { int i; for (i = 1; Cha_type[i].name; i++) ! cha_printf(cha_output, "%d %s\n", i, Cha_type[i].name); } ! void cha_print_cform_table(void) { int i, j; for (i = 1; Cha_type[i].name; i++) ! for (j = 1; Cha_form[i][j].name; j++) ! printf("%d %d %s\n", i, j, Cha_form[i][j].name); } --- 1335,1353 ---- } } ! void ! cha_print_ctype_table(void) { int i; for (i = 1; Cha_type[i].name; i++) ! cha_printf(cha_output, "%d %s\n", i, Cha_type[i].name); } ! void ! cha_print_cform_table(void) { int i, j; for (i = 1; Cha_type[i].name; i++) ! for (j = 1; Cha_form[i][j].name; j++) ! printf("%d %d %s\n", i, j, Cha_form[i][j].name); } diff -crN chasen-2.2.3/lib/select.c chasen-2.2.4/lib/select.c *** chasen-2.2.3/lib/select.c Fri Feb 23 22:07:14 2001 --- chasen-2.2.4/lib/select.c Fri Mar 16 13:06:17 2001 *************** *** 35,59 **** * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ! * $Id: select.c,v 1.12 2001/02/23 13:07:14 kazuma-t Exp $ */ #include #include #include #include "sufary.h" #define MINMIN(X,Y) ((X) < (Y) ? (X) : (Y)) ! /* sistring の比較 */ ! static int cmp_sistr(char*, char*, int*, int); ! static eresult sa_search(SUFARY*, char*, int, int); #define sa_aryidx2txtptr(ary, idx) \ ! ((ary)->txtmap + ntohl(((long *)((ary)->arymap + (idx) * sizeof(long)))[0])) ! ! /****************************************************************************** * eresult sa_search(SUFARY *ary, char *s, int keylen, int base_offset); * * purpose --- 35,62 ---- * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * ! * $Id: select.c,v 1.16 2001/03/16 04:06:17 kazuma-t Exp $ */ #include #include #include #include "sufary.h" + #include "tokenizer.h" + #include "htobe.h" #define MINMIN(X,Y) ((X) < (Y) ? (X) : (Y)) ! /* ! * sistring の比較 ! */ ! static int cmp_sistr(char *, char *, int *, int); ! static eresult sa_search(SUFARY *, char *, int, int); #define sa_aryidx2txtptr(ary, idx) \ ! ((ary)->txtmap + betoh(((long *)((ary)->arymap + (idx) * sizeof(long)))[0])) ! /* * eresult sa_search(SUFARY *ary, char *s, int keylen, int base_offset); * * purpose *************** *** 87,173 **** * ary->ee : エラーコード(どんなエラーが発生したか) * ary->right, ary->left : 自動的に次の検索範囲を狭める * ! *****************************************************************************/ ! static eresult sa_search(SUFARY *ary, char *s, int keylen, int base_offset) { long left_outside, right_outside, left_inside, right_inside, cur, tmp; int hr; int prefix_length_L = base_offset; int prefix_length_R = base_offset; ! int offset = 0, diffpos /* 異なり位置 */; if (ary == NULL || ary->arymap == NULL) { ! fprintf (stderr, "specify target files first.\n"); return FAIL; } ! /* 検索範囲初期設定 */ right_outside = ary->right + 1; left_outside = ary->left - 1; right_inside = ary->right; left_inside = ary->left; ! /* step 1. Match する点を見つける。*/ ! cur = (right_outside - left_outside)/2 + left_outside; while (1) { offset = MINMIN(prefix_length_L, prefix_length_R); hr = cmp_sistr(sa_aryidx2txtptr(ary, cur) + offset, s + offset, &diffpos, keylen - offset); ! if (hr < 0){ /* LESS */ left_outside = cur; prefix_length_L = offset + diffpos; ! } else if (hr > 0){ /* ABOVE */ right_outside = cur; prefix_length_R = offset + diffpos; ! } else { /* MATCH ... if (hr == 0) */ left_inside = right_inside = cur; break; } ! tmp = (right_outside - left_outside)/2 + left_outside; ! /* left_outside は -1 の可能性あり。 ! ∴ tmp も -1 になることがある 980319 */ if (cur == tmp || tmp < ary->left) ! return FAIL; /* 見つからなかった... */ cur = tmp; } ! /* step 2. right_inside を確定する */ offset = prefix_length_R; ! cur = (right_outside - right_inside)/2 + right_inside; while (1) { hr = cmp_sistr(sa_aryidx2txtptr(ary, cur) + offset, s + offset, &diffpos, keylen - offset); ! if (hr > 0){ /* ABOVE */ right_outside = cur; offset += diffpos; ! } else if (hr == 0){ /* MATCH */ right_inside = cur; ! } else { /* LESS ... if (hr < 0) */ ary->ee = STRUCTURE; ! return ERROR; } ! tmp = (right_outside - right_inside)/2 + right_inside; ! if (cur==tmp) break; cur = tmp; } ! /* step 3. left_inside を確定する */ offset = prefix_length_L; ! cur = left_inside - (left_inside - left_outside)/2; /* 980319 */ ! if (cur < 0) cur = 0; while (1) { hr = cmp_sistr(sa_aryidx2txtptr(ary, cur) + offset, s + offset, &diffpos, keylen - offset); ! if (hr < 0){ /* LESS */ left_outside = cur; offset += diffpos; ! } else if (hr == 0){ /* MATCH */ left_inside = cur; ! } else { /* ABOVE ... if (hr > 0) */ ary->ee = STRUCTURE; ! return ERROR; } ! tmp = left_inside - (left_inside - left_outside)/2; /* 980319 */ if (tmp < 0) tmp = 0; if (cur == tmp) --- 90,189 ---- * ary->ee : エラーコード(どんなエラーが発生したか) * ary->right, ary->left : 自動的に次の検索範囲を狭める * ! */ ! static eresult ! sa_search(SUFARY * ary, char *s, int keylen, int base_offset) { long left_outside, right_outside, left_inside, right_inside, cur, tmp; int hr; int prefix_length_L = base_offset; int prefix_length_R = base_offset; ! int offset = 0, diffpos /* 異なり位置 */ ; if (ary == NULL || ary->arymap == NULL) { ! fprintf(stderr, "specify target files first.\n"); return FAIL; } ! /* ! * 検索範囲初期設定 ! */ right_outside = ary->right + 1; left_outside = ary->left - 1; right_inside = ary->right; left_inside = ary->left; ! /* ! * step 1. Match する点を見つける。 ! */ ! cur = (right_outside - left_outside) / 2 + left_outside; while (1) { offset = MINMIN(prefix_length_L, prefix_length_R); hr = cmp_sistr(sa_aryidx2txtptr(ary, cur) + offset, s + offset, &diffpos, keylen - offset); ! if (hr < 0) { /* LESS */ left_outside = cur; prefix_length_L = offset + diffpos; ! } else if (hr > 0) { /* ABOVE */ right_outside = cur; prefix_length_R = offset + diffpos; ! } else { /* MATCH ... if (hr == 0) */ left_inside = right_inside = cur; break; } ! tmp = (right_outside - left_outside) / 2 + left_outside; ! /* ! * left_outside は -1 の可能性あり。 ∴ tmp も -1 になることがある ! * 980319 ! */ if (cur == tmp || tmp < ary->left) ! return FAIL; /* 見つからなかった... */ cur = tmp; } ! /* ! * step 2. right_inside を確定する ! */ offset = prefix_length_R; ! cur = (right_outside - right_inside) / 2 + right_inside; while (1) { hr = cmp_sistr(sa_aryidx2txtptr(ary, cur) + offset, s + offset, &diffpos, keylen - offset); ! if (hr > 0) { /* ABOVE */ right_outside = cur; offset += diffpos; ! } else if (hr == 0) { /* MATCH */ right_inside = cur; ! } else { /* LESS ... if (hr < 0) */ ary->ee = STRUCTURE; ! return _ERROR; } ! tmp = (right_outside - right_inside) / 2 + right_inside; ! if (cur == tmp) ! break; cur = tmp; } ! /* ! * step 3. left_inside を確定する ! */ offset = prefix_length_L; ! cur = left_inside - (left_inside - left_outside) / 2; /* 980319 */ ! if (cur < 0) ! cur = 0; while (1) { hr = cmp_sistr(sa_aryidx2txtptr(ary, cur) + offset, s + offset, &diffpos, keylen - offset); ! if (hr < 0) { /* LESS */ left_outside = cur; offset += diffpos; ! } else if (hr == 0) { /* MATCH */ left_inside = cur; ! } else { /* ABOVE ... if (hr > 0) */ ary->ee = STRUCTURE; ! return _ERROR; } ! tmp = left_inside - (left_inside - left_outside) / 2; /* 980319 */ if (tmp < 0) tmp = 0; if (cur == tmp) *************** *** 175,189 **** cur = tmp; } ! /* ary->left, ary->right の再設定 */ ary->left = left_inside; ary->right = right_inside; return CONT; } ! ! /****************************************************************************** * int cmp_sistr(char *txt, char *str, int *diffpos, int len); * * purpose --- 191,206 ---- cur = tmp; } ! /* ! * ary->left, ary->right の再設定 ! */ ary->left = left_inside; ary->right = right_inside; return CONT; } ! /* * int cmp_sistr(char *txt, char *str, int *diffpos, int len); * * purpose *************** *** 204,249 **** * description * 980422 に大改造 * ! *****************************************************************************/ ! static int cmp_sistr(char *txt, char *str, int *diffpos, int len) { int i; for (i = 0; i < len; i++, txt++, str++) ! if(*txt != *str) { *diffpos = i; ! return ((unsigned char)*txt - (unsigned char)*str); } *diffpos = len; return 0; } ! /****************************************************************************** ! * void sa_reset(SUFARY *ary); * * purpose * SUFARY型変数aryのrightとleftを元に戻す * * parameters * ary : 対象array ! * ! * return value ! * なし ! * ! * description ! * SUFARY構造をユーザに直接見せないために必要 ! *****************************************************************************/ #define sa_reset(ary) \ { (ary)->left = 0; \ (ary)->right = (ary)->arraysize - 1; } - extern int Cha_lang_e; #define mbclen(mb) \ ! ((!Cha_lang_e && ((unsigned char)(mb) & 0x80)) ? 2 : 1) ! /****************************************************************************** * char **sa_common_prefix_search(SUFARY *ary, * char *pattern, ! * char* char_tyep, * char **result); * * purpose --- 221,260 ---- * description * 980422 に大改造 * ! */ ! static int ! cmp_sistr(char *txt, char *str, int *diffpos, int len) { int i; for (i = 0; i < len; i++, txt++, str++) ! if (*txt != *str) { *diffpos = i; ! return ((unsigned char) *txt - (unsigned char) *str); } *diffpos = len; return 0; } ! /* ! * sa_reset(SUFARY *ary); * * purpose * SUFARY型変数aryのrightとleftを元に戻す * * parameters * ary : 対象array ! */ #define sa_reset(ary) \ { (ary)->left = 0; \ (ary)->right = (ary)->arraysize - 1; } #define mbclen(mb) \ ! ((((unsigned char)(mb) & 0x80)) ? 2 : 1) ! /* * char **sa_common_prefix_search(SUFARY *ary, * char *pattern, ! * int pattern_len, * char **result); * * purpose *************** *** 252,258 **** * parameters * ary : 対象array * pattern : 検索キーワード ! * char_type : pattern 各位置の文字のバイト長 * result : 検索結果を格納するバッファ * * return value --- 263,269 ---- * parameters * ary : 対象array * pattern : 検索キーワード ! * pattern_len : pattern のバイト長 * result : 検索結果を格納するバッファ * * return value *************** *** 264,273 **** * 例: 辞書 a, abc, any, anybody, anymore, ... * 検索キーワード anybody * 結果 a, any, anybody ! * ! * since 1998/04/09 ! *****************************************************************************/ ! char **sa_common_prefix_search(SUFARY *ary, char *pattern, char *char_type, char **result) { int cursor; int result_last = 0; --- 275,284 ---- * 例: 辞書 a, abc, any, anybody, anymore, ... * 検索キーワード anybody * 結果 a, any, anybody ! */ ! char ** ! sa_common_prefix_search(SUFARY * ary, char *pattern, int pattern_len, ! char **result) { int cursor; int result_last = 0; *************** *** 276,286 **** sa_reset(ary); cursor = 0; ! while (1){ ! int next = cursor + mbclen(pattern[cursor]); if (sa_search(ary, pattern, next, cursor) != CONT) break; ! for (tmp = ary->left; tmp <= ary->right; tmp++){ char *entry = sa_aryidx2txtptr(ary, tmp); if (entry[next] != '\0') break; --- 287,298 ---- sa_reset(ary); cursor = 0; ! while (1) { ! int next = cursor + cha_tok_mblen(Cha_tokenizer, pattern + cursor, ! pattern_len - cursor); if (sa_search(ary, pattern, next, cursor) != CONT) break; ! for (tmp = ary->left; tmp <= ary->right; tmp++) { char *entry = sa_aryidx2txtptr(ary, tmp); if (entry[next] != '\0') break; diff -crN chasen-2.2.3/lib/sufary.h chasen-2.2.4/lib/sufary.h *** chasen-2.2.3/lib/sufary.h Mon Feb 19 13:37:06 2001 --- chasen-2.2.4/lib/sufary.h Fri Mar 16 13:06:17 2001 *************** *** 1,16 **** ! /*--------------------------------------------------------------* ! * * ! * SUFARY --- Suffix Array 検索のためのライブラリ * ! * * ! * sufary.h - SUFARYライブラリヘッダファイル * ! * * ! *--------------------------------------------------------------*/ ! #ifndef _SUFARY_H_ ! #define _SUFARY_H_ #include "config.h" #include #ifndef KEYWORD_MAX_LENGTH #define KEYWORD_MAX_LENGTH 5000 --- 1,21 ---- ! /* ! * SUFARY --- Suffix Array 検索のためのライブラリ ! * sufary.h - SUFARYライブラリヘッダファイル ! * ! * $Id: sufary.h,v 1.13 2001/03/16 04:06:17 kazuma-t Exp $ ! */ ! #ifndef __SUFARY_H__ ! #define __SUFARY_H__ #include "config.h" + + #ifdef HAVE_WINSOCK2_H + #include + #endif + #ifdef HAVE_SYS_TYPES_H #include + #endif #ifndef KEYWORD_MAX_LENGTH #define KEYWORD_MAX_LENGTH 5000 *************** *** 18,58 **** /* コマンドの戻り値 */ typedef enum eresult_{ ! CONT, ! FAIL, ! EXIT, ! ERROR } eresult; /* エラーコード */ typedef enum eerror_ { ! NOERROR, ! COMMAND, ! MEMORY, ! FILEIN, ! FILEOUT, ! STRUCTURE, ! UNKNOWN } eerror; /* SUFARY構造体 */ typedef struct { ! eerror ee; /* グローバルエラーコードを保持 */ ! long arraysize; /* Array の大きさ */ ! long left; /* 検索範囲の左端(範囲の内側を指す) 旧 g_bottom */ ! long right; /* 検索範囲の右端(範囲の内側を指す) 旧 g_top */ ! off_t txtsz; /* テキストファイルのサイズ */ ! off_t arysz; /* アレイファイルのサイズ */ ! void *txtmap; /* テキストファイルのマップアドレス */ ! void *arymap; /* アレイファイルのマップアドレス */ } SUFARY; /* プロトタイプ宣言 汎用ルーチン */ /*** select.c ***/ ! char **sa_common_prefix_search(SUFARY*, char*, char*, char**); /*** chfile.c ***/ SUFARY *sa_openfiles(char*, char*); void sa_closefiles(SUFARY*); ! #endif /* _SUFARY_H_ */ --- 23,63 ---- /* コマンドの戻り値 */ typedef enum eresult_{ ! CONT, ! FAIL, ! EXIT, ! _ERROR } eresult; /* エラーコード */ typedef enum eerror_ { ! _NOERROR, ! COMMAND, ! MEMORY, ! FILEIN, ! FILEOUT, ! STRUCTURE, ! UNKNOWN } eerror; /* SUFARY構造体 */ typedef struct { ! eerror ee; /* グローバルエラーコードを保持 */ ! long arraysize; /* Array の大きさ */ ! long left; /* 検索範囲の左端(範囲の内側を指す) 旧 g_bottom */ ! long right; /* 検索範囲の右端(範囲の内側を指す) 旧 g_top */ ! off_t txtsz; /* テキストファイルのサイズ */ ! off_t arysz; /* アレイファイルのサイズ */ ! void *txtmap; /* テキストファイルのマップアドレス */ ! void *arymap; /* アレイファイルのマップアドレス */ } SUFARY; /* プロトタイプ宣言 汎用ルーチン */ /*** select.c ***/ ! char **sa_common_prefix_search(SUFARY*, char*, int, char**); /*** chfile.c ***/ SUFARY *sa_openfiles(char*, char*); void sa_closefiles(SUFARY*); ! #endif /* __SUFARY_H__ */ diff -crN chasen-2.2.3/lib/tokenizer.c chasen-2.2.4/lib/tokenizer.c *** chasen-2.2.3/lib/tokenizer.c Thu Jan 1 09:00:00 1970 --- chasen-2.2.4/lib/tokenizer.c Sat Mar 10 11:02:43 2001 *************** *** 0 **** --- 1,468 ---- + /* + * tokenizer.c - tokenize a string + * + * Copyright (C) 1996, 1997, 2000, 2001, + * Nara Institute of Science and Technology + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Nara Institute of + * Science and Technology. + * 4. The name Nara Institute of Science and Technology may not be used to + * endorse or promote products derived from this software without specific + * prior written permission. + * + * + * THIS SOFTWARE IS PROVIDED BY Nara Institute of Science and Technology + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE Nara Institute + * of Science and Technology BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED + * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Id: tokenizer.c,v 1.5 2001/03/10 02:02:43 kazuma-t Exp $ + */ + + #include + #include + + #include "chalib.h" + #include "tokenizer.h" + + #define is_space(c) (((c) == ' ') || ((c) == '\t')) + + enum ja_char_type { + JA_NOSTATE, + JA_SPACE, + PROLONGED, /* KATAKANA-HIRAGANA PROLONGED SOUND MARK */ + KATAKANA, /* KATAKANA LETTER (SMALL) [A-KE] */ + SMALL_KATAKANA, /* KATAKANA LETTER SMALL AIUEO, TU, YAYUYO, WA */ + FULL_LATIN, /* FULLWIDTH LATIN (CAPITAL|SMALL) LETTER [A-Z] */ + HALF_LATIN, /* LATIN (CAPITAL|SMALL) LETTER [A-Z] */ + JA_OTHER, + }; + + enum en_char_type { + EN_NOSTATE, + EN_SPACE, + EN_LATIN, + EN_OTHER, + }; + + static int euc_mblen(unsigned char*, int); + static int iso8859_mblen(unsigned char*, int); + static int utf8_mblen(unsigned char*, int); + + static int ja_char_type_parse(chasen_tok_t*,int,int*,int); + static int en_char_type_parse(chasen_tok_t*,int,int*,int); + + static enum ja_char_type + ja_euc_char_type(chasen_tok_t*, unsigned char *, int); + static enum ja_char_type + ja_utf8_char_type(chasen_tok_t*, unsigned char *, int); + + static enum en_char_type + en_char_type(chasen_tok_t*, unsigned char *, int); + + typedef int (*ja_char_type_get)(chasen_tok_t*,unsigned char*,int); + typedef int (*en_char_type_get)(chasen_tok_t*,unsigned char*,int); + + static int is_anno(chasen_tok_t*, unsigned char*, int); + static int is_anno2(anno_info*, unsigned char*, int); + + /* + * This function constructs a tokenizer object. + * If an error occurs, it terminates a process. + */ + chasen_tok_t * + cha_tok_new(int lang, int encode) + { + chasen_tok_t *tok; + + tok = cha_malloc(sizeof(chasen_tok_t)); + + tok->lang = lang; + tok->encode = encode; + tok->_is_malloced = 0; + tok->_char_type = tok->__static_char_type; + tok->_anno_type = tok->__static_anno_type; + tok->anno = NULL; + + if (lang == CHASEN_LANG_JA) { + if (encode == CHASEN_ENCODE_EUC) { + tok->_mblen = euc_mblen; + tok->_char_type_parse = ja_char_type_parse; + tok->_get_char_type = (ja_char_type_get)ja_euc_char_type; + } else if (encode == CHASEN_ENCODE_UTF8) { + tok->_mblen = utf8_mblen; + tok->_char_type_parse = ja_char_type_parse; + tok->_get_char_type = (ja_char_type_get)ja_utf8_char_type; + } + } else if (lang == CHASEN_LANG_EN) { + if (encode == CHASEN_ENCODE_ISO8859) { + tok->_mblen = iso8859_mblen; + tok->_char_type_parse = en_char_type_parse; + tok->_get_char_type = (en_char_type_get)en_char_type; + } else if (encode == CHASEN_ENCODE_UTF8) { + tok->_mblen = utf8_mblen; + tok->_char_type_parse = en_char_type_parse; + tok->_get_char_type = (en_char_type_get)en_char_type; + } + } else { + tok->_mblen = iso8859_mblen; + tok->_char_type_parse = en_char_type_parse; + tok->_get_char_type = (en_char_type_get)en_char_type; + } + + return tok; + } + + /* + * This function destroys the tokenizer object. + */ + void + cha_tok_delete(chasen_tok_t *tok) + { + if (tok->_is_malloced) { + cha_free(tok->_char_type); + cha_free(tok->_anno_type); + } + cha_free(tok); + } + + /* + * This function parses string str with len bytes. + */ + int + cha_tok_parse(chasen_tok_t *tok, unsigned char *str, int len) + { + int cursor, head; + int state, state0; + anno_info *anno = NULL; + + tok->string = str; + tok->string_len = len; + + if (len > sizeof(tok->__static_char_type)) { + tok->_char_type = cha_malloc(sizeof(int) * len); + tok->_anno_type = cha_malloc(sizeof(int) * len); + tok->_is_malloced = 1; + } + + memset(tok->_char_type, 0, sizeof(int) * len); + memset(tok->_anno_type, 0, sizeof(int) * len); + + state0 = state = 0; /* NOSTATE */ + for (cursor = head = 0; cursor < len; + cursor += tok->_mblen(str + cursor, len - cursor)) { + if (state0 < 0) { /* in annotation */ + if (is_anno2(anno, str, cursor)) { + state0 = 0; /* end of annotation */ + } else { + continue; /* skip annotation */ + } + } + + state = is_anno(tok, str + cursor, len - cursor); + if (state < 0) { + anno = &(tok->anno[-state]); + tok->_anno_type[cursor] = -state; + } else { + state = tok->_get_char_type(tok, str + cursor, len - cursor); + state = tok->_char_type_parse(tok, state, &state0, cursor); + } + + if (state != state0) { + tok->_char_type[head] = cursor - head; + head = cursor; + } + state0 = state; + } + tok->_char_type[head] = cursor - head; + + return 1; + } + + /* + * This function returns the length in bytes of the multibyte character + * on cursor in the parsed string. + * + * If the character is `\0', it returns 1. + */ + int + cha_tok_mblen_on_cursor(chasen_tok_t *tok, int cursor) + { + return tok->_mblen(tok->string + cursor, + tok->string_len - cursor); + } + + /* + * This function returns the length in bytes of the multibyte character + * str with len bytes. + * + * If the character is `\0', it returns 1. + */ + int + cha_tok_mblen(chasen_tok_t *tok, unsigned char *str, int len) + { + return tok->_mblen(str, len); + } + + /* + * This function returns the length in bytes of the substring + * including same kind of characters. + * + * In the middle of substring, it returns 0. + */ + int + cha_tok_char_type_len(chasen_tok_t *tok, int cursor) + { + return tok->_char_type[cursor]; + } + + /* + * This function sets information of annotation anno in tokenizer tok. + */ + void + cha_tok_set_annotation(chasen_tok_t *tok, anno_info *anno) + { + tok->anno = anno; + } + + /* + * This function returns the type of annotation, if there is + * the begining of annotation on cursor. The type is more than 0. + * + * And it returns less than 0 value, if the character on cursor is + * white space. + * + * Otherwise it returns 0. + */ + int + cha_tok_anno_type(chasen_tok_t *tok, int cursor) + { + return tok->_anno_type[cursor]; + } + + /* + * This function is ad-hoc. You should not use it. + * + * But it is used in lib/parse.c(cha_parse_sentence). + */ + int + cha_tok_is_jisx0208_latin(chasen_tok_t *tok, int cursor, int len) + { + if ((tok->lang == CHASEN_LANG_JA) && + (tok->_get_char_type(tok, tok->string + cursor, len) == FULL_LATIN)) + return 1; + else + return 0; + + } + + /* + * private functions + */ + static int + euc_mblen(unsigned char *str, int len) + { + if (len >= 3 && + str[0] == 0x8f && (str[1] & 0x80) && (str[2] & 0x80)) { + return 3; + } else if (len >= 2 && (str[0] & 0x80) && (str[1] & 0x80)) { + return 2; + } + + return 1; + } + + static int + iso8859_mblen(unsigned char *str, int len) + { + return 1; + } + + static int + utf8_mblen(unsigned char *str, int len) + { + if (len >= 4 && (str[0] & 0xf0) == 0xf0 && + (str[1] & 0x80) && (str[2] & 0x80) && (str[3] & 0x80)) { + return 4; + } else if (len >= 3 && (str[0] & 0xe0) == 0xe0 && + (str[1] & 0x80) && (str[2] & 0x80)) { + return 3; + } else if (len >= 2 && (str[0] & 0xc0) == 0xc0 && (str[1] & 0x80)) { + return 2; + } + + return 1; + } + + static int + ja_char_type_parse(chasen_tok_t *tok, int state, int *state0, int cursor) + { + if (state == JA_SPACE) { + tok->_anno_type[cursor] = -1; + } else if ((state == HALF_LATIN) || + (state == FULL_LATIN)) { + ; /* do nothing */ + } else if (((*state0 == KATAKANA) && + ((state == PROLONGED) || + (state == SMALL_KATAKANA))) || + (state == KATAKANA)) { + state = KATAKANA; + } else { + state = JA_OTHER; + *state0 = JA_NOSTATE; + } + + return state; + } + + static int + en_char_type_parse(chasen_tok_t *tok, int state, int *state0, int cursor) + { + if (state == EN_SPACE) { + tok->_anno_type[cursor] = -1; + } else if (state == EN_OTHER) { + *state0 = EN_NOSTATE; + } + + return state; + } + + static enum ja_char_type + ja_euc_char_type(chasen_tok_t *tok, unsigned char *str, int len) + { + int mblen = tok->_mblen(str, len); + + if (mblen == 1) { + if (isalpha(str[0])) { + return HALF_LATIN; + } else if (is_space(str[0])) { + return JA_SPACE; + } + } else if (mblen == 2) { + if ((str[0] == 0xa1) && (str[1] == 0xbc)) { + return PROLONGED; + } else if (str[0] == 0xa5) { + if ((str[1] == 0xa1) || (str[1] == 0xa3) || + (str[1] == 0xa5) || (str[1] == 0xa7) || + (str[1] == 0xa9) || (str[1] == 0xc3) || + (str[1] == 0xe3) || (str[1] == 0xe5) || + (str[1] == 0xe7) || (str[1] == 0xee)) { + return SMALL_KATAKANA; + } else { + return KATAKANA; + } + } else if ((str[0] == 0xa3) && (str[1] >= 0xc1)) { + return FULL_LATIN; + } + } + + return JA_OTHER; + } + + static enum ja_char_type + ja_utf8_char_type(chasen_tok_t *tok, unsigned char *str, int len) + { + int mblen = tok->_mblen(str, len); + + if (mblen == 1) { + if (isalpha(str[0])) { + return HALF_LATIN; + } else if (is_space(str[0])) { + return JA_SPACE; + } + } else if (mblen == 3) { + if ((str[0] == 0xe3) && (str[1] == 0x83) && (str[2] == 0xbc)) { + return PROLONGED; + } else if (str[0] == 0xe3) { + if (((str[1] == 0x82) && + ((str[2] == 0xa1) || (str[2] == 0xa3) || + (str[2] == 0xa5) || (str[2] == 0xa7) || + (str[2] == 0xa9))) || + ((str[1] == 0x83) && + ((str[2] == 0x83) || (str[2] == 0xa3) || + (str[2] == 0xa5) || (str[2] == 0xa7) || + (str[2] == 0xae)))) { + return SMALL_KATAKANA; + } else if (((str[1] == 0x82) && + (str[2] >= 0xa1) && (str[2] <= 0xbf)) || + ((str[1] == 0x83) && + (str[2] >= 0x80) && (str[2] <= 0xBA))) { + return KATAKANA; + } + } else if ((str[0] == 0xef) && + (((str[1] == 0xbc) && + (str[2] >= 0xa1) && (str[2] <= 0xba)) || + ((str[1] == 0xbd) && + (str[2] >= 0x81) && (str[2] <= 0x9a)))) { + return FULL_LATIN; + } + } + + return JA_OTHER; + } + + static enum en_char_type + en_char_type(chasen_tok_t *tok, unsigned char *str, int len) + { + unsigned char c = str[0]; + + if (is_space(c)) { + return EN_SPACE; + } else if (isalpha(c)) { /* for English only */ + return EN_LATIN; + } + + return EN_OTHER; + } + + + static int + is_anno(chasen_tok_t *tok, unsigned char *string, int len) + { + int i; + anno_info *anno = tok->anno; + + if (anno == NULL) { + return 0; + } + for (i = 1; (anno[i].str1 != NULL); i++) { + if (len < anno[i].len1) { + continue; + } + if (!memcmp(string, anno[i].str1, anno[i].len1)) { + return -i; + } + } + return 0; + } + + static int + is_anno2(anno_info *anno, unsigned char *bos, int cursor) + { + int len2 = anno->len2; + + if (cursor < len2) { + return 0; + } + + return (memcmp(bos + cursor - len2, anno->str2, len2) == 0); + } diff -crN chasen-2.2.3/lib/tokenizer.h chasen-2.2.4/lib/tokenizer.h *** chasen-2.2.3/lib/tokenizer.h Thu Jan 1 09:00:00 1970 --- chasen-2.2.4/lib/tokenizer.h Sat Mar 10 11:02:43 2001 *************** *** 0 **** --- 1,59 ---- + /* + * $Id: tokenizer.h,v 1.4 2001/03/10 02:02:43 kazuma-t Exp $ + */ + + #ifndef __TOKENIZER_H__ + #define __TOKENIZER_H__ + + #include "chalib.h" + + /* for language */ + enum cha_lang { + CHASEN_LANG_JA, + CHASEN_LANG_EN + }; + + /* for encoding scheme */ + enum cha_encode { + CHASEN_ENCODE_EUC, + CHASEN_ENCODE_ISO8859, + CHASEN_ENCODE_UTF8 + }; + + typedef struct _chasen_tok_t chasen_tok_t; + struct _chasen_tok_t { + enum cha_lang lang; + enum cha_encode encode; + unsigned char *string; + int string_len; + anno_info *anno; + /* private member */ + int *_char_type; + int *_anno_type; + int _is_malloced; + int __static_char_type[CHA_INPUT_SIZE]; + int __static_anno_type[CHA_INPUT_SIZE]; + int (*_mblen)(unsigned char*, int); + int (*_get_char_type)(chasen_tok_t*,unsigned char*, int); + int (*_char_type_parse)(chasen_tok_t*,int,int*,int); + }; + + extern enum cha_lang Cha_lang; + extern enum cha_encode Cha_encode; + extern chasen_tok_t *Cha_tokenizer; + + chasen_tok_t *cha_tok_new(int, int); + void cha_tok_delete(chasen_tok_t*); + + int cha_tok_parse(chasen_tok_t*, unsigned char*, int); + int cha_tok_mblen_on_cursor(chasen_tok_t*, int); + + int cha_tok_mblen(chasen_tok_t*,unsigned char*,int); + int cha_tok_char_type_len(chasen_tok_t*, int); + + void cha_tok_set_annotation(chasen_tok_t*, anno_info*); + int cha_tok_anno_type(chasen_tok_t*, int); + + int cha_tok_is_jisx0208_latin(chasen_tok_t*, int, int); + + #endif /*__TOKENIZER_H__ */ diff -crN chasen-2.2.3/lib/zentohan.c chasen-2.2.4/lib/zentohan.c *** chasen-2.2.3/lib/zentohan.c Fri Feb 23 21:51:34 2001 --- chasen-2.2.4/lib/zentohan.c Sat Feb 24 15:17:22 2001 *************** *** 37,173 **** * * 1991/01/08/Tue Yutaka MYOKI(Nagao Lab., KUEE) * ! * $Id: zentohan.c,v 1.4 2001/02/23 12:51:34 kazuma-t Exp $ */ #include #include #define iskanji(x) ((unsigned char)(x) & 0x80) ! #ifdef SJIS ! ! /*********************************************************************** * euc->sjis, sjis->euc hankakukana->zenkaku code translation ! ***********************************************************************/ ! unsigned char *euc2sjis(unsigned char *str) { unsigned char *s; ! for(s=str;*s;s++){ ! if (*s>=0x80){ ! if (*s & 1) { ! *(s+1) -= 0x61; ! if (*(s+1) >= 0x7f) (*(s+1))++; ! } else *(s+1) -= 2; ! ! *s = ((*s + 1) >> 1) + 0x30; ! if (*s >= 0xa0) *s += 0x40; ! s++; ! } } return str; } ! /*********************************************************************** * hankana2zenkana1 * * return code: もとのポインタを必要に応じて進めたポインタ ! ***********************************************************************/ ! static unsigned char *hankana2zenkana1(unsigned char *moto, unsigned char *ato) { static unsigned char hankaku[] = ! {0x81, 0x42, 0x81, 0x75, 0x81, 0x76, 0x81, 0x41, 0x81, 0x45, 0x83, 0x92, ! 0x83, 0x40, 0x83, 0x42, 0x83, 0x44, 0x83, 0x46, 0x83, 0x48, 0x83, 0x83, ! 0x83, 0x85, 0x83, 0x87, 0x83, 0x62, 0x81, 0x5b, 0x83, 0x41, 0x83, 0x43, ! 0x83, 0x45, 0x83, 0x47, 0x83, 0x49, 0x83, 0x4a, 0x83, 0x4c, 0x83, 0x4e, ! 0x83, 0x50, 0x83, 0x52, 0x83, 0x54, 0x83, 0x56, 0x83, 0x58, 0x83, 0x5a, ! 0x83, 0x5c, 0x83, 0x5e, 0x83, 0x60, 0x83, 0x63, 0x83, 0x65, 0x83, 0x67, ! 0x83, 0x69, 0x83, 0x6a, 0x83, 0x6b, 0x83, 0x6c, 0x83, 0x6d, 0x83, 0x6e, ! 0x83, 0x71, 0x83, 0x74, 0x83, 0x77, 0x83, 0x7a, 0x83, 0x7d, 0x83, 0x7e, ! 0x83, 0x80, 0x83, 0x81, 0x83, 0x82, 0x83, 0x84, 0x83, 0x86, 0x83, 0x88, ! 0x83, 0x89, 0x83, 0x8a, 0x83, 0x8b, 0x83, 0x8c, 0x83, 0x8d, 0x83, 0x8f, ! 0x83, 0x93, 0x81, 0x4a, 0x81, 0x4b}; ! unsigned char *p, *s; int dakuten; p = ato; s = moto; ! if (*(s+1) == 0xde && ! (*s>=0xb6 && *s<=0xc4 || *s>=0xca && *s<=0xce)) ! dakuten = 1; ! else if (*(s+1) == 0xdf && *s>=0xca && *s<=0xce) ! dakuten = 2; else ! dakuten = 0; *p++ = hankaku[(*s - 0xa1) * 2]; *p++ = hankaku[(*s - 0xa1) * 2 + 1] + dakuten; ! if (dakuten){ s++; } return s; } ! unsigned char *hankana2zenkana(unsigned char *str) { ! unsigned char tmp[8192 /*CHA_INPUT_SIZE*/], *p, *s; p = tmp; ! for (s = str; *s; s++){ ! if ((0x80 <= *s && *s < 0xa0) || (0xe0 <= *s && *s <= 0xfc)) { ! *p++ = *s++; ! *p++ = *s; ! } else if (0xa1<=*s && *s<=0xdf) { ! s=hankana2zenkana1(s,p); ! p+=2; ! } else { ! *p++ = *s; ! } } *p = '\0'; strcpy(str, tmp); return str; } ! ! static void sjis2euc1(unsigned char *hi, unsigned char *lo) { ! if (*hi >= 0xe0) (*hi) -= 0x40; *hi = ((*hi - 0x30) << 1); ! ! if (*lo >= 0x9f) (*lo) += 2; else { (*hi)--; ! if (*lo >= 0x80) (*lo) += 0x60; ! else (*lo) += 0x61; } } ! ! unsigned char *sjis2euc(unsigned char *str) { ! unsigned char tmp[8192 /*CHA_INPUT_SIZE*/], *p, *s; if (!str[0]) ! return str; p = tmp; ! for (s = str; *s; s++){ ! if ((0x80 <= *s && *s < 0xa0) || (0xe0 <= *s && *s <= 0xfc)) { ! *p++ = *s++; ! *p++ = *s; ! sjis2euc1(p-2,p-1); ! } else if (0xa1<=*s && *s<=0xdf) { ! s=hankana2zenkana1(s,p); ! p+=2; ! sjis2euc1(p-2,p-1); ! } else { ! *p++ = *s; ! } } *p = '\0'; strcpy(str, tmp); return str; } ! ! #endif /* SJIS */ --- 37,190 ---- * * 1991/01/08/Tue Yutaka MYOKI(Nagao Lab., KUEE) * ! * $Id: zentohan.c,v 1.5 2001/02/24 06:17:22 kazuma-t Exp $ */ + #ifdef SJIS + #include #include #define iskanji(x) ((unsigned char)(x) & 0x80) ! /* * euc->sjis, sjis->euc hankakukana->zenkaku code translation ! */ ! unsigned char * ! euc2sjis(unsigned char *str) { unsigned char *s; ! for (s = str; *s; s++) { ! if (*s >= 0x80) { ! if (*s & 1) { ! *(s + 1) -= 0x61; ! if (*(s + 1) >= 0x7f) ! (*(s + 1))++; ! } else ! *(s + 1) -= 2; ! ! *s = ((*s + 1) >> 1) + 0x30; ! if (*s >= 0xa0) ! *s += 0x40; ! s++; ! } } return str; } ! /* * hankana2zenkana1 * * return code: もとのポインタを必要に応じて進めたポインタ ! */ ! static unsigned char * ! hankana2zenkana1(unsigned char *moto, unsigned char *ato) { static unsigned char hankaku[] = ! { 0x81, 0x42, 0x81, 0x75, 0x81, 0x76, 0x81, 0x41, ! 0x81, 0x45, 0x83, 0x92, 0x83, 0x40, 0x83, 0x42, ! 0x83, 0x44, 0x83, 0x46, 0x83, 0x48, 0x83, 0x83, ! 0x83, 0x85, 0x83, 0x87, 0x83, 0x62, 0x81, 0x5b, ! 0x83, 0x41, 0x83, 0x43, 0x83, 0x45, 0x83, 0x47, ! 0x83, 0x49, 0x83, 0x4a, 0x83, 0x4c, 0x83, 0x4e, ! 0x83, 0x50, 0x83, 0x52, 0x83, 0x54, 0x83, 0x56, ! 0x83, 0x58, 0x83, 0x5a, 0x83, 0x5c, 0x83, 0x5e, ! 0x83, 0x60, 0x83, 0x63, 0x83, 0x65, 0x83, 0x67, ! 0x83, 0x69, 0x83, 0x6a, 0x83, 0x6b, 0x83, 0x6c, ! 0x83, 0x6d, 0x83, 0x6e, 0x83, 0x71, 0x83, 0x74, ! 0x83, 0x77, 0x83, 0x7a, 0x83, 0x7d, 0x83, 0x7e, ! 0x83, 0x80, 0x83, 0x81, 0x83, 0x82, 0x83, 0x84, ! 0x83, 0x86, 0x83, 0x88, 0x83, 0x89, 0x83, 0x8a, ! 0x83, 0x8b, 0x83, 0x8c, 0x83, 0x8d, 0x83, 0x8f, ! 0x83, 0x93, 0x81, 0x4a, 0x81, 0x4b ! }; ! unsigned char *p, *s; int dakuten; p = ato; s = moto; ! if (*(s + 1) == 0xde && ! (*s >= 0xb6 && *s <= 0xc4 || *s >= 0xca && *s <= 0xce)) ! dakuten = 1; ! else if (*(s + 1) == 0xdf && *s >= 0xca && *s <= 0xce) ! dakuten = 2; else ! dakuten = 0; *p++ = hankaku[(*s - 0xa1) * 2]; *p++ = hankaku[(*s - 0xa1) * 2 + 1] + dakuten; ! if (dakuten) { s++; } return s; } ! unsigned char * ! hankana2zenkana(unsigned char *str) { ! unsigned char tmp[8192]; /* CHA_INPUT_SIZE */ ! unsigned char *p, *s; p = tmp; ! for (s = str; *s; s++) { ! if ((0x80 <= *s && *s < 0xa0) || (0xe0 <= *s && *s <= 0xfc)) { ! *p++ = *s++; ! *p++ = *s; ! } else if (0xa1 <= *s && *s <= 0xdf) { ! s = hankana2zenkana1(s, p); ! p += 2; ! } else { ! *p++ = *s; ! } } *p = '\0'; strcpy(str, tmp); return str; } ! static void ! sjis2euc1(unsigned char *hi, unsigned char *lo) { ! if (*hi >= 0xe0) ! (*hi) -= 0x40; *hi = ((*hi - 0x30) << 1); ! ! if (*lo >= 0x9f) ! (*lo) += 2; else { (*hi)--; ! if (*lo >= 0x80) ! (*lo) += 0x60; ! else ! (*lo) += 0x61; } } ! unsigned char * ! sjis2euc(unsigned char *str) { ! unsigned char tmp[8192]; /* CHA_INPUT_SIZE */ ! unsigned char *p, *s; if (!str[0]) ! return str; p = tmp; ! for (s = str; *s; s++) { ! if ((0x80 <= *s && *s < 0xa0) || (0xe0 <= *s && *s <= 0xfc)) { ! *p++ = *s++; ! *p++ = *s; ! sjis2euc1(p - 2, p - 1); ! } else if (0xa1 <= *s && *s <= 0xdf) { ! s = hankana2zenkana1(s, p); ! p += 2; ! sjis2euc1(p - 2, p - 1); ! } else { ! *p++ = *s; ! } } *p = '\0'; strcpy(str, tmp); return str; } ! #endif /* SJIS */