diff -crP php-2.0.1/README.jp php-2.0.1.jp_urat-5.3/README.jp *** php-2.0.1/README.jp Thu Jan 1 09:00:00 1970 --- php-2.0.1.jp_urat-5.3/README.jp Thu Mar 26 03:53:40 1998 *************** *** 0 **** --- 1,329 ---- + PHP/FI 2.0.1 ÆüËܸì+¦Á¥Ñ¥Ã¥Á 5.3 1998.3.26 + php-2.0.1.jp_urat-5.3.gz + md5 checksum: + ±ºÆÊ ͵ 8-->8-->8-->8 + + README.kanji.euc === PHP/FI ´Á»ú¥³¡¼¥É¥Ï¥ó¥É¥ê¥ó¥° Âè 5 ÈÇ === + + PHP/FI 2.0b12 ÍÑ + + ====================================================================== + INTRO + ---------------------------------------------------------------------- + PHP/FI¤Ë¤Æ´Á»ú¥³¡¼¥É¤ÎÊÑ´¹¤ò¼Â¸½¤¹¤ë¥Ñ¥Ã¥±¡¼¥¸¤òÁȤó¤Ç¤ß¤Þ¤·¤¿¡£ + ¤Ç¤¹¤¬¡¢¤³¤Î¥³¡¼¥É¤Ï¤Û¤Ü¤¹¤Ù¤Æ¡Ö¤Ñ¤¯¤ê¡×¤Ç¤¹ ;-) + + ¥Ù¡¼¥¹¤Ï¡¢ + 1. shige@csk.JUNET ¤µ¤ó¤Î filters-2.0 + 2. º´Æ£¸øÉ§(K.Sato)¤µ¤ó¤Î qkc 1.0 + 3. ëËܹ§¹À(NBC02362@niftyserve.or.jp)¤Î gawk-2.15.4+mb1.03 + ¤«¤éÇÒ¼Ú¤¤¤¿¤·¤Þ¤·¤¿¡£ + + ====================================================================== + INSTALL + ---------------------------------------------------------------------- + PHP/FI 2.0b12 ¤Î¥½¡¼¥¹¤Î¥È¥Ã¥×¥Ç¥£¥ì¥¯¥È¥ê(.../php-2.0b12/)¤Ç + % patch -p1 < kanji2.0b12.patch + ¤ò¼Â¹Ô¤·¡¢¤¢¤È¤Ï PHP/FI ¤Î¥È¥Ã¥×¥Ç¥£¥ì¥¯¥È¥ê + (.../php-2.0b12/)¤Ë¤ÆÄ̾ïÄ̤ê¤Î + % ./install + ¤È¤·¤Þ¤¹¡£ + + install¥¹¥¯¥ê¥×¥È¤ÎºÇ¸å¤Ë¡¢ + Do you want to use multi-byte extension for regex library? [y/N] + ¤È¿Ö¤«¤ì¤Þ¤¹¤Î¤Ç¡¢¤³¤³¤Ç¡ÖY¡×¤òÆþÎϤ¹¤ë¤È¡¢ + ¥Þ¥ë¥Á¥Ð¥¤¥È³ÈÄ¥Àµµ¬É½¸½¤¬»È¤¨¤ë¤è¤¦¤Ë¤Ê¤ê¤Þ¤¹¡£ + + # ¾Ü¤·¤¯¤Ï¡¢jp.regex/README.MB ¤ò»²¾È¤·¤Æ²¼¤µ¤¤¡£ + # ¤¿¤À¤·¡¢gawk-2.15.4+mb1.03¤Î¤â¤Î¤ò¤½¤Î¤Þ¤ÞÃÖ¤¤¤Æ¤¢¤ê¤Þ¤¹¤Î¤Ç¡¢ + # ¸æÎ»¾µ¤Î¤Û¤É¤ò... + + ¤Þ¤¿¡¢¥·¥¹¥Æ¥à¤ËPOSIXÀµµ¬É½¸½¤Î¥Ñ¥Ã¥±¡¼¥¸¤¬¤Ê¤¤¾ì¹ç¡¢ + .../php-2.0b12/config.h ¤Î + + /* Define if you have the regcomp function. 
*/ + #define HAVE_REGCOMP 1 + + ¤¬¥³¥á¥ó¥È¥¢¥¦¥È¤µ¤ì¤Æ¤¤¤ë¤³¤È¤¬¤¢¤ê¤Þ¤¹¡£ + ¤½¤Î¾ì¹ç¥Þ¥ë¥Á¥Ð¥¤¥È³ÈÄ¥Àµµ¬É½¸½¤òÍ­¸ú¤Ë¤¹¤ë¤¿¤á¡¢ + ¤³¤Î¹Ô¤ò #define ¤·¤Æ²¼¤µ¤¤¡£ + (¾åµ­¤Î¤è¤¦¤Ë¤Ê¤Ã¤Æ¤¤¤ì¤Ð OK ¤Ç¤¹¡£) + + ====================================================================== + APACHE MODULE + ---------------------------------------------------------------------- + ¥¢¥Ñ¥Ã¥Á¤Î¥â¥¸¥å¡¼¥ë¤È¤·¤Æ¼Â¹Ô¤µ¤ì¤ë¾ì¹ç¡¢¾åµ­¥Þ¥ë¥Á¥Ð¥¤¥È³ÈÄ¥¤Îlibregex + ¤ò»ÈÍѤ¹¤ë¤Ë¤Ï¡¢¥¢¥Ñ¥Ã¥Á¤Î¥³¥ó¥Ñ¥¤¥ë»þ¤Ë + jp.regex/libregex.a + jp.regex/regex.h + ¤Îξ¥Õ¥¡¥¤¥ë¤¬É¬ÍפȤʤê¤Þ¤¹¡£ + + 1. ¾åµ­¥Õ¥¡¥¤¥ë¤ò¥¢¥Ñ¥Ã¥Á¤Î¥½¡¼¥¹¥Ç¥£¥ì¥¯¥È¥ê¤Ë¥³¥Ô¡¼¤·¤Þ¤¹¡£ + + % cp -r jp.regex /usr/local/etc/httpd + ( /usr/local/etc/httpd ¤ÏŬÅö¤Ê¥Ç¥£¥ì¥¯¥È¥ê¤ËÊѹ¹¤·¤Æ²¼¤µ¤¤) + + 2. Configuration ¤ÎÃæ¤Î¥³¥ó¥Ñ¥¤¥ë¥Õ¥é¥Ã¥°¤Ë jp.regex ¤ò²Ã¤¨¤Þ¤¹¡£ + PHP/FI ¤Î¥é¥¤¥Ö¥é¥ê¤È¥½¡¼¥¹¤â²Ã¤¨¤Þ¤¹¡£ + + EXTRA_CFLAGS= -I./jp.regex -DSERVER_SUBVERSION=\"PHP/FI-2.0b12\" + EXTRA_LIBS= -L. -lphp -lgdbm -L/usr/local/pgsql/lib -lpq -L./jp.regex -lregex -lm + + 3. Configuration¤ÎWANTHSREGEX¤ÎÃͤòdefault ¤«¤é no ¤ËÊѹ¹¤·¤Æ¡¢ + ÇÛÉÛ regex ¥Ñ¥Ã¥±¡¼¥¸¤Î¥ê¥ó¥¯¤ò¤·¤Ê¤¤ÍѤˤ·¤Þ¤¹¡£ + + Rule WANTHSREGEX=default + ¢­ + Rule WANTHSREGEX=no + + # apache ML ¤Î ¤Ä¤Ã¤·¡¼(m2@soum.co.jp)¤µ¤ó ¤¢¤ê¤¬¤È¤¦¤´¤¶¤¤¤Þ¤¹¡£ + + 4. Configuration ¤ÎºÇ¸å¤Ë PHP/FI ¤Î Module ¤òÀë¸À¤·¤Þ¤¹¡£ + + Module php_module mod_php.o + + 5. Configure ¤ò¼Â¹Ô¤·¤Æ Makefile ¤ò¤Ä¤¯¤ê¤Þ¤¹¡£ + + % ./Configure + + 6. make ¤·¤Þ¤¹¡£ + + % make + + ====================================================================== + HOW-TO + ---------------------------------------------------------------------- + ¤³¤Î¥Ñ¥Ã¥±¡¼¥¸¤Ï¡¢PHP/FI ¾å¤Ç´Á»ú¥³¡¼¥É¤ò»È¤¦ºÝ¤ËÆâÉô¥³¡¼¥É¤ò + EUC ¤ËÅý°ì¤·¡¢¤½¤Î½ÐÎÏ¥³¡¼¥É¤ò¥»¥Ã¥È¤¹¤ë¤â¤Î¤Ç¤¹¡£ + + ¤Þ¤¿¡¢¿·¤·¤¤´Ø¿ô¤È¤·¤Æ¡¢ + MBstrlen(string) + MBsubstr(string,start,length) + ¤Î2¤Ä¤ò²Ã¤¨¤Æ¤¢¤ê¤Þ¤¹¡£ + + ---------------------------------------------------------------------- + 1. 
SetKanjiOutput(mode) + + ´Ø¿ô¡ÖSetKanjiOutput("¥â¡¼¥É")¡×¤ò»ØÄꤹ¤ë¤³¤È¤Ç¡¢ + ¤½¤Î½ÐÎÏ¥³¡¼¥É¤ò»ØÄꤹ¤ë¤³¤È¤¬¤Ç¤­¤Þ¤¹¡£ + + [¥â¡¼¥É] + ¥â¡¼¥É¤Ï { EUC | JIS | SJIS } ¤Î 3¼ïÎà ¤Ç¤¹¡£ + ʸ»úÎó¤È¤·¤Æ»ØÄꤷ¤Þ¤¹¡£ + + + Apache ¥µ¡¼¥Ð¡¼¤Î¾ì¹ç¤Ë¤Ï¡¢ + .htaccess ¥Õ¥¡¥¤¥ë¤ÎÃæ¤Ç + + phpKanjiOutput ¥â¡¼¥É + + ¤ò»ØÄꤹ¤ë¤³¤È¤Ç¡¢½ÐÎÏ¥³¡¼¥É¤òÅý°ì¤·¤Þ¤¹¡£¥Ç¥Õ¥©¥ë¥È¤Ç¤Ï EUC ¤Ë¤Ê¤ê¤Þ¤¹¡£ + + ¤³¤ì¤òÊѹ¹¤·¤¿¾ì¹ç¤Ë¤Ï¡¢¤ª¼ê¿ô¤Ç¤¹¤¬¡¢¥½¡¼¥¹¥Ä¥ê¡¼(.../php-2.0b9/src/) + ¤Ë¤¢¤ë kanjiconv.c¤ò¼êư¤ÇÊѹ¹¤·¤Æ²¼¤µ¤¤¡£ + Apache ¤Î¾ì¹ç 61¹ÔÌÜ¡¢¤½¤ì°Ê³°¤Ç¤Ï 63¹ÔÌܤΠ+ + output_kanji_code=EUC; + + ¤Î¡ÖEUC¡×¤ÎÉôʬ¤ò JIS ¤Þ¤¿¤Ï SJIS ¤ËÊѹ¹¤·¤Æ²¼¤µ¤¤¡£ + ( ¤³¤³¤Ç¤Ï¡¢Ê¸»úÎó¤Ë¤Ï¤·¤Ê¤¤¤Ç²¼¤µ¤¤¡£ + php.h ¤ÎÃæ¤Ç #define ¤µ¤ì¤¿ int·¿ ¤Ë¤Ê¤ê¤Þ¤¹ ) + + ---------------------------------------------------------------------- + 2. MBstrlen(string) + + string ¤Ë»ØÄꤵ¤ì¤¿Ê¸»úÎó¤Î¡Öʸ»ú¿ô¡×¤òÊÖ¤·¤Þ¤¹¡£ + + ---------------------------------------------------------------------- + 3. MBsubstr(string, start, length) + + string ¤Ë»ØÄꤵ¤ì¤¿Ê¸»úÎ󤫤顢start ¤Ç»Ï¤Þ¤ê length ʸ»úʬ¤Îʸ»úÎó¤ò + ¼è¤ê½Ð¤·¤Þ¤¹¡£ + start ¤Ï 0 ¤«¤é»Ï¤Þ¤ëÀ°¿ô¤Ç¤¹¡£ + + ====================================================================== + EXAMPLE + ---------------------------------------------------------------------- + 1. SetKanjiOutput(mode) + $res "; + ... + + SetKanjiOutput("SJIS"); /* ¥Õ¥¡¥¤¥ë¤Ø SJIS ¤ÇÊݸ */ + $fp=fopen("/dos/result.txt","w"); + fputs($fp,$res); + fclose($fp); + ... + > + + ¥Ö¥é¥¦¥¶¤Î¤ß¤Ê¤é¤º¡¢¥Õ¥¡¥¤¥ë¤Ø¤Î½ÐÎϤâÀ©¸æ¤Ç¤­¤Þ¤¹¡£ + ¥Ç¥Ð¥¤¥¹¤Ø¤Î½ÐÎÏÁ°¤Ë»ÈÍѤ·¤Æ²¼¤µ¤¤¡£ + + ¥Ç¥Õ¥©¥ë¥È¤Î½ÐÎÏ¥³¡¼¥É¤Ï¡ÖEUC¡×¤Ç¤¹¡£ + + 2. MBstrlen(string) + + + + ·ë²Ì: + len = 14, mblen = 8 + + 3. 
MBsubstr(string) + + + + ·ë²Ì: + mitsu = ¤ª¤ß¤Ä + + + ====================================================================== + MACHINE + ---------------------------------------------------------------------- + ³ÎǧºÑ¤ßưºî´Ä¶­¤Ï¡¢ + Linux 2.0.27 (Slackware 3.1) + JE 0.9.8 + + PostgreSQL v6.0 + JP-patch + ¾å¤Î + apache_1.2.0 + php-2.0b12 + ¥Ð¡¼¥¸¥ç¥ó¤Ç¤¹¡£ + + ====================================================================== + CHANGES + ---------------------------------------------------------------------- + -- Âè 4 ÈÇ + * ¾¯¡¹¤Î¥Ð¥°¥Õ¥£¥¯¥¹ + + CGIÈÇ (php.cgi) ¤òºîÀ®¤¹¤ë¤È¤­¤Ë echo.c ¤Ç¥¨¥é¡¼¤¬µ¯¤³¤ëÉÔ¶ñ¹ç + + + jp.regex/libregex.a ¥¹¥¿¥Æ¥£¥Ã¥¯¥é¥¤¥Ö¥é¥ê¤ò¥ê¥ó¥¯¤·¤Ê¤¤ÉÔ¶ñ¹ç + + + install ¥¹¥¯¥ê¥×¥È¤Ç HAVE_REGCOMP ¤¬ÄêµÁ¤µ¤ì¤Ê¤¤(POSIX¤Î¥é¥¤¥Ö¥é¥ê + ¤ò»ý¤¿¤Ê¤¤)¥Þ¥·¥ó¤Ç¤ÎÉÔ¶ñ¹ç + + -- Âè 3 ÈÇ + * CGIÈǤΠEcho ´Ø¿ô¤¬¾ï¤Ë EUC ¤ò½ÐÎϤ¹¤ëÉÔ¶ñ¹ç¤ò½¤Àµ + + * Mail ´Ø¿ô¤ËÆüËܸì¤ò»ÈÍѤ¹¤ë¤È¡¢EUC¤Î¤Þ¤ÞÁ÷¤é¤ì¤ëÉÔ¶ñ¹ç¤ò½¤Àµ + + * MBstrlen, MBsubstr ¤Î¥Þ¥ë¥Á¥Ð¥¤¥È´Ø¿ô¤òÄɲà + + * Á°²óή¤·¤¿¡ÖKanjiConv(ʸ»úÎó,¥â¡¼¥É)¡×¤Ï¡¢ÆâÉô¥³¡¼¥É + ¤òÅý°ì¤·¤¿¤³¤È¤Ë¤è¤Ã¤Æµ¡Ç½¤·¤Ê¤¯¤Ê¤ê¤Þ¤·¤¿¤Î¤Ç¡¢ + ºï½ü¤µ¤»¤ÆÄº¤­¤Þ¤·¤¿¡£ + + + ====================================================================== + NOTICE + ---------------------------------------------------------------------- + * ¡Ö¤È¤ê¤¢¤¨¤ºÆ°¤¯¤â¤Î¡×¤òÂçÁ°Äó¤È¤·¤ÆÁȤߤޤ·¤¿¤Î¤Ç + ¤Ê¤Ë¤«¤·¤éÌäÂê¤â¤¢¤ë¤«¤È»×¤ï¤ì¤Þ¤¹¤¬¡¢´°Á´¤Ë̵ÊݾڤǤ¹¡£ + ³Æ¿Í¤ÎÀÕǤ¤Ë¤ª¤¤¤Æ¸æ»ÈÍѤ¯¤À¤µ¤¤¡£ + + * APACHE ¾å¤Ç¤ï¤¶¤ï¤¶ regex ¤Î¥Ñ¥Ã¥±¡¼¥¸¤¬ÇÛÉÛ¤µ¤ì¤Æ¤¤¤ë¤È¤¤¤¦¤³¤È¤Ï¡¢ + ²¿¤«ÌäÂ꤬¤¢¤ë¤Î¤«¤Ê? 
¤È»×¤¤¤Ä¤Ä¡¢ÇÛÉۥѥ屡¼¥¸¤Ï»ÈÍѤ·¤Æ¤¤¤Þ¤»¤ó¡£ + ¤Ê¤Ë¤«ÌäÂ꤬¤¢¤ì¤Ð¡¢Ï¢Íí失¤ë¤È¹¬¤¤¤Ç¤¹¡£ + + ====================================================================== + TODO + ---------------------------------------------------------------------- + ¹â®²½ mmap() ¤ÎÂбþ (¤¤¤Þ¤À¤Ë...¤¹¤ß¤Þ¤»¤óÉÔÊÙ¶¯¤Ç _o_) + + ====================================================================== + 08-19-1997 Á°ÅĽ¼¹¨ + mitsu@tramp.co.jp + + -->8-->8-->8-->8 diff -crP php-2.0.1/config.h.in php-2.0.1.jp_urat-5.3/config.h.in *** php-2.0.1/config.h.in Tue May 27 07:35:37 1997 --- php-2.0.1.jp_urat-5.3/config.h.in Wed Feb 18 21:03:39 1998 *************** *** 40,46 **** #undef HAVE_STRCASECMP /* Define if you have the mmap function. */ ! #undef HAVE_MMAP /* Define if you have the putenv function. */ #undef HAVE_PUTENV --- 40,46 ---- #undef HAVE_STRCASECMP /* Define if you have the mmap function. */ ! /* #undef HAVE_MMAP */ /* Define if you have the putenv function. */ #undef HAVE_PUTENV diff -crP php-2.0.1/install php-2.0.1.jp_urat-5.3/install *** php-2.0.1/install Wed Nov 19 15:50:15 1997 --- php-2.0.1.jp_urat-5.3/install Wed Feb 18 21:03:39 1998 *************** *** 714,720 **** PHPSENDMAIL="" fi ! if grep "#define HAVE_REGCOMP 1" config.h >/dev/null 2>&1 then echo "Your system appears to have a Posix compliant regex library"; echo "On some systems this library is broken. UnixWare 2.0.x is an" --- 714,741 ---- PHPSENDMAIL="" fi ! $ECHO_N "Do you want to use multi-byte extension for regex library? [y/N] " ! read a ! if [ "$a" = "y" -o "$a" = Y ] ! then ! echo "Using multi-byte extension for regular expression library" ! echo "" ! LIBREGEX=-lregex ! LIBREGEXDIR=-L./jp.regex ! REGEX_INCLUDE=-I./jp.regex ! OURREGEX=jp.regex/libregex.a ! REGEXMSG1="For Apache 1.1.1 you must copy src/jp.regex/libregex.a and src/jp.regex/regex.h to $APACHE_DIR and add -I. and -llibregex.a to Configuration" ! 
REGEXMSG2="For Apache 1.2 or higher you must set the WANTHSREGEX Configuration parameter to N and copy src/jp.regex/libregex.a and src/jp.regex/regex.h to $APACHE_DIR then add -I. and -lregex to Configuration" ! STATICLIBREGEX=regex/libregex.a ! echo "#define MB 1" >> config.h ! ! if grep "#define HAVE_REGCOMP 1" config.h >/dev/null 2>&1 ! then ! sed "s/^.*HAVE_REGCOMP.*$/#define HAVE_REGCOMP 1/" < config.h >config.$$ ! rm -f config.h ! mv config.$$ config.h ! fi ! elif grep "#define HAVE_REGCOMP 1" config.h >/dev/null 2>&1 then echo "Your system appears to have a Posix compliant regex library"; echo "On some systems this library is broken. UnixWare 2.0.x is an" diff -crP php-2.0.1/patch/fileinode.patch php-2.0.1.jp_urat-5.3/patch/fileinode.patch *** php-2.0.1/patch/fileinode.patch Thu Jan 1 09:00:00 1970 --- php-2.0.1.jp_urat-5.3/patch/fileinode.patch Thu Mar 26 03:28:23 1998 *************** *** 0 **** --- 1,111 ---- + From php-list-return-5475-urat=first.tsukuba.ac.jp@php.iquest.net Sat Feb 7 08:17:52 1998 + Return-Path: php-list-return-5475-urat=first.tsukuba.ac.jp@php.iquest.net + Received: from iquest3.iquest.net (iquest3.iquest.net [209.43.20.203]) by daichi.first.tsukuba.ac.jp (8.8.8/3.4W396040220) with SMTP id IAA16086 for ; Sat, 7 Feb 1998 08:17:51 +0900 (JST) + Received: (qmail 25490 invoked by uid 54979); 6 Feb 1998 20:46:46 -0000 + Mailing-List: contact php-list-help@php.iquest.net; run by ezmlm + Delivered-To: mailing list php-list@php.iquest.net + Received: (qmail 25451 invoked from network); 6 Feb 1998 20:46:44 -0000 + Received: from mail.esc.de (HELO esc.de) (194.115.54.34) + by iquest3.iquest.net with SMTP; 6 Feb 1998 20:46:44 -0000 + Received: from workaholics.net (guivol@escpc23.esc.de [194.115.54.153]) + by esc.de (8.8.5/8.8.5) with ESMTP id VAA17307 + for ; Fri, 6 Feb 1998 21:46:44 +0100 + Sender: guivol@esc.de + Message-ID: <34DB7705.489FBD27@workaholics.net> + Date: Fri, 06 Feb 1998 21:48:06 +0100 + From: Guido Vollbeding + Organization: 
Independent Workaholics Network + X-Mailer: Mozilla 4.04 [en] (X11; I; Linux 2.0.33 i586) + MIME-Version: 1.0 + To: php-list@php.iquest.net + Subject: [PHP] fileInode($fd) instead of fileInode(filename)? + Content-Type: text/plain; charset=iso-8859-1 + Content-Transfer-Encoding: 8bit + Status: RO + + Hi folks, + + for a Digital Image Archive project based on a MySQL database with phtml + interface I need the PHP fileInode() function with a file descriptor + argument rather than the filename. + + Fortunately, it was easy to patch the php source for the desired feature + and it seems to work properly. + I use PHP 2.0.1 as module with Apache 1.2.5. + + The simple idea is to check the argument type (LNUMBER or STRING) in the + function implementation and call the standard library fstat() function + instead of stat() if the argument is of type LNUMBER instead of STRING. + Here is the patch: + + escpc23:/www/php/src # diff -C 2 -p file.c.orig file.c + *** file.c.orig Fri Feb 6 14:10:24 1998 + --- file.c Fri Feb 6 14:21:12 1998 + *************** void FileFunc(int type) { + *** 608,611 **** + --- 608,617 ---- + return; + } + + if (s->type == LNUMBER) { + + if (fstat((int)s->intval,&sb) == -1) { + + Push("-1",LNUMBER); + + return; + + } + + } else { + #if APACHE + if(!CurrentStatFile) { + *************** void FileFunc(int type) { + *** 632,635 **** + --- 638,642 ---- + } + } + + } + switch(type) { + case 0: /* fileperms */ + + Would it be reasonable to include this feature in future php releases? + + Here is the background for my need: + In order to achieve a flexible image data handling and access, I use a + modified filesystem (under Linux) *without* directories and filenames. + Instead, image files are accessed directly via the inode number. The + inode number is then stored in the corresponding database record for + reference. 
+ + The image files can then simply be read through ordinary system functions + by giving the inode number as string as the "filename" argument in the + open-call. + In php it looks like: + $oresult = GetImageSize("$DOCUMENT_ROOT/imgo/$oino.jpg"); + " > + The "imgo" directory is the mount point for my modified file-system, + which simply converts the filename argument (like atol()) to get the + inode number for open (no dir search). + So far it works well without further modifications. + + However, when creating the file (via http file upload), I need a way + to fetch the generated inode number information. Since I don´t have + a ´valid´ filename at this point, I do this by creating an empty file + first and ask the inode number via the file descriptor. Afterwards, + I can rewrite the actual file as usual. + + $fd = fopen("$DOCUMENT_ROOT/imgo/a","w"); + $oino = fileInode($fd); + fclose($fd); + exec("cp $filename $DOCUMENT_ROOT/imgo/$oino.jpg"); + + (Note: "a" is an arbitrary ´dummy´ filename which is not used further + inside the modified filesystem. The only reference for later access is + the generated inode number, stored in the database instead of a dir.) + + This scheme seems to work well in the desired fashion, + with minimal modifications. 
+ + Regards, + Guido + + ______________________________PHP/FI Mailing List______________________________ + To unsubscribe send an empty message to php-list-unsubscribe@php.iquest.net + To unsub you@host.com, use: php-list-unsubscribe-you=host.com@php.iquest.net + For help: php-list-help@php.iquest.net Archive: http://www.tryc.on.ca/php.html + diff -crP php-2.0.1/patch/log_speedup.patch php-2.0.1.jp_urat-5.3/patch/log_speedup.patch *** php-2.0.1/patch/log_speedup.patch Thu Jan 1 09:00:00 1970 --- php-2.0.1.jp_urat-5.3/patch/log_speedup.patch Thu Mar 26 03:28:15 1998 *************** *** 0 **** --- 1,77 ---- + From php-list-return-5424-urat=first.tsukuba.ac.jp@php.iquest.net Thu Feb 5 06:31:57 1998 + Return-Path: php-list-return-5424-urat=first.tsukuba.ac.jp@php.iquest.net + Received: from iquest3.iquest.net (iquest3.iquest.net [209.43.20.203]) by daichi.first.tsukuba.ac.jp (8.8.8/3.4W396040220) with SMTP id GAA07988 for ; Thu, 5 Feb 1998 06:31:56 +0900 (JST) + Received: (qmail 16597 invoked by uid 54979); 4 Feb 1998 21:30:25 -0000 + Mailing-List: contact php-list-help@php.iquest.net; run by ezmlm + Delivered-To: mailing list php-list@php.iquest.net + Received: (qmail 16574 invoked from network); 4 Feb 1998 21:30:24 -0000 + Received: from ns.viet.net (207.201.22.241) + by iquest3.iquest.net with SMTP; 4 Feb 1998 21:30:24 -0000 + Received: from localhost (tin@localhost) by ns.viet.net (8.8.8/8.8.8) with SMTP id NAA29317 for ; Wed, 4 Feb 1998 13:30:08 -0800 + Date: Wed, 4 Feb 1998 13:30:08 -0800 (PST) + From: Tin Le + X-Sender: tin@ns.viet.net + To: php-list@php.iquest.net + Subject: [PHP] speedup patch for php2 + Message-ID: + MIME-Version: 1.0 + Content-Type: TEXT/PLAIN; charset=US-ASCII + Status: RO + + For those who are still using PHP2 _and_ especially if you have logging + turned on for most of your web pages, here is a speed up patch that should + help gain you some extra performance. It works for me on a small site that + gets around 30K hits a day. 
I'll try it out on a larger site next. + + Background, I was looking into a problem I was having with Apache 1.2.5 and + both PHP2 and PHP3 compiled in as modules, when I noticed the minor speedup + potential in log.c. Essentially it's calling _RegReplace() to change all + '/' into '_'. I figured since that is a special case, no regexs or + anything else to worry about, it's safe to do this inline and save the + overhead of a function call, plus _RegReplace has all the other overheads. + I put #ifdef around the call to _RegReplace() in case anyone ever wants to + put that back. + + To apply this patch, cd to the src directory of PHP/FI 2. + + ------------php2.cdif---------------------------------- + *** log.c Wed Feb 4 12:57:37 1998 + --- log.c.new Sun Feb 1 23:01:01 1998 + *************** + *** 120,125 **** + --- 120,126 ---- + + char *filename_to_logfn(char *filename) { + char *lfn, *lp, *ret; + + int i; + + if (forcelogfile) { filename = forcelogfile; } + lfn = estrdup(1,filename); + *************** + *** 132,138 **** + --- 133,144 ---- + lp++; + } + } + + #if 0 + lp = _RegReplace("/","_",lp); + + #else + + for (i=0, lfn=lp; *lfn && itype == LNUMBER) { + if (fstat((int)s->intval,&sb) == -1) { + Push("-1",LNUMBER); + return; + } + } else { #if APACHE if(!CurrentStatFile) { CurrentStatFile = estrdup(0,php_rqst->filename); *************** *** 631,636 **** --- 637,643 ---- return; } } + } switch(type) { case 0: /* fileperms */ sprintf(temp,"%ld",(long)sb.st_mode); *************** *** 1255,1260 **** --- 1262,1268 ---- Push("",STRING); return; } + (void)conv2euc(buf,len); Push((buf=AddSlashes(buf,1)),STRING); } *************** *** 1291,1296 **** --- 1299,1305 ---- Push("",STRING); return; } + (void)conv2euc(buf,len); rbuf=estrdup(1,buf); c = *buf; lc=(char)0; *************** *** 1389,1395 **** } ParseEscapes(buf); StripSlashes(buf); ! ret = fputs(buf,fp); sprintf(temp,"%d",ret); Push(temp,STRING); } --- 1398,1404 ---- } ParseEscapes(buf); StripSlashes(buf); ! 
ret = kanji_fputs(buf,fp); sprintf(temp,"%d",ret); Push(temp,STRING); } diff -crP php-2.0.1/src/jp.regex/Makefile php-2.0.1.jp_urat-5.3/src/jp.regex/Makefile *** php-2.0.1/src/jp.regex/Makefile Thu Jan 1 09:00:00 1970 --- php-2.0.1.jp_urat-5.3/src/jp.regex/Makefile Wed Feb 18 21:03:40 1998 *************** *** 0 **** --- 1,18 ---- + SHELL = /bin/sh + + CFLAGS=-I. $(AUX_CFLAGS) + LFLAGS= + LIBS= + OBJS=dfa.o mbc.o regex.o + SRCS=dfa.c mbc.c regex.c + + .c.o: + $(CC) -c $(CFLAGS) $< + + lib: $(OBJS) + rm -f libregex.a + ar crv libregex.a $(OBJS) + $(RANLIB) libregex.a + + clean: + rm -f *.o libregex.a diff -crP php-2.0.1/src/jp.regex/README.MB php-2.0.1.jp_urat-5.3/src/jp.regex/README.MB *** php-2.0.1/src/jp.regex/README.MB Thu Jan 1 09:00:00 1970 --- php-2.0.1.jp_urat-5.3/src/jp.regex/README.MB Wed Feb 18 21:03:40 1998 *************** *** 0 **** --- 1,369 ---- + ¡ü¡ü Gnu Awk (gawk) 2.15, patchlevel 4 + multi-byte extension 1.03 ¡ü¡ü + ¡ü¡ü Aug. 29, 1994 by t^2 ¡ü¡ü + + gawk-2.15.4+mb1.03 -- ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»úÂбþÈÇ Gnu Awk + + ¡ü³µÍ× + + GNU ¥×¥í¥¸¥§¥¯¥È¤Ë¤è¤ë awk (°Ê²¼ gawk) ¤ò¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»úÂбþ²½¤·¤¿ + ¤â¤Î¤Ç¤¹. + + ¡ü»ÈÍÑË¡ + + gawk ¤«¤é¤Î³ÈÄ¥Éôʬ¤À¤±¤òÀâÌÀ¤·¤Þ¤¹. + + Áý¤¨¤¿¥ª¥×¥·¥ç¥ó¤Ï°Ê²¼¤ÎÄ̤ê¤Ç¤¹. + + -Wctype=ASCII ¤Þ¤¿¤Ï --ctype=ASCII + ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤ò¹Íθ¤·¤Þ¤»¤ó. ¤³¤Î¥ª¥×¥·¥ç¥ó¤ò»ÈÍѤ·¤¿¾ì¹ç, + ¥ª¥ê¥¸¥Ê¥ë¤Î gawk ¤òñ¤Ë£¸¥Ó¥Ã¥È¥¯¥ê¡¼¥ó¤È¤·¤¿Æ°ºî¤È¤Ê¤ê¤Þ¤¹. + + -Wctype=EUC ¤Þ¤¿¤Ï --ctype=EUC + ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤È¤·¤Æ EUC ¤òǧ¼±¤·¤Þ¤¹. + + -Wctype=SJIS ¤Þ¤¿¤Ï --ctype=SJIS + ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤È¤·¤Æ Shift-JIS ¤òǧ¼±¤·¤Þ¤¹. + + MS-DOS °Ê³°¤Î¥·¥¹¥Æ¥à¤Ç, Makefile(.in)? ¤ò½ñ¤­´¹¤¨¤º¤Ë¥¤¥ó¥¹¥È¡¼ + ¥ë¤·¤¿¾ì¹ç, ¥Ç¥Õ¥©¥ë¥È¤Ç¤Ï EUC ¤òǧ¼±¤·¤Þ¤¹. MS-DOS ¤Ç¤Ï¥Ç¥Õ¥© + ¥ë¥È¤Ç Shift-JIS ¤òǧ¼±¤·¤Þ¤¹. + + °Ê²¼, ÁȤ߹þ¤ß´Ø¿ô¤Ê¤É¤ÎÊѹ¹ÅÀ¤òµó¤²¤Þ¤¹. + + substr() ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤òʬÃǤµ¤»¤ë¤è¤¦¤Ê»ØÄê¤Ï¼«Æ°Åª¤Ë½¤Àµ + ¤·¤Þ¤¹. Î㤨¤Ð, + substr("a¤¢i¤¤u¤¦e¤¨o¤ª", 3, 4) + ¤Ï, + substr("a¤¢i¤¤u¤¦e¤¨o¤ª", 2, 3) ==> "¤¢i" + ¤È¤¤¤¦·ë²Ì¤Ë¤Ê¤ê¤Þ¤¹. ¤É¤Á¤é¤Î°ú¿ô¤â¾®¤µ¤¯¤Ê¤ëÊý¸þ + ¤Ø´Ý¤á¤Þ¤¹. 
+ + jindex() ¤³¤ì¤Ï, °ÊÁ°¤«¤é¤è¤¯¤¢¤ë index() ¤ÎÊÑ·Á¤Ç¤¹. + index() ¤Î·ë²Ì¤ò¥Ð¥¤¥È°ÌÃ֤ǤϤʤ¯Ê¸»ú°ÌÃÖ¤ÇÊÖ¤·¤Þ¤¹. + + jlength() Ʊ¤¸¤¯ length() ¤ÎÊÑ·Á¤Ç¤¹. ¥Ð¥¤¥È¿ô¤Ç¤Ï¤Ê¤¯Ê¸»ú¿ô + ¤òÊÖ¤·¤Þ¤¹. + + jsubstr() substr() ¤ÎÊÑ·Á¤Ç¤¹. ¥Ð¥¤¥È°ÌÃÖ, ¥Ð¥¤¥È¿ô¤Î»ØÄê¤ÎÂå + ¤ï¤ê¤Ëʸ»ú°ÌÃÖ, ʸ»ú¿ô¤ò»ØÄꤷ¤Þ¤¹. + + RS ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤ò»ÈÍѤ¹¤ë¤³¤È¤¬¤Ç¤­¤Þ¤¹. + + ¤½¤Î¾ ¡¦¼±Ê̻Ҥ˥ޥë¥Á¥Ð¥¤¥Èʸ»ú¤ò»ÈÍѤǤ­¤Þ¤¹. + + ¡¦¥Ñ¥¹Ì¾¤Ë´Þ¤Þ¤ì¤ë¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤Ï¤Ë¤Ä¤¤¤Æ¤ÏÁ´¤¯¹Í + θ¤·¤Æ¤¤¤Þ¤»¤ó. Ãí°Õ¤·¤Æ¤¯¤À¤µ¤¤. + + ¡¦Ê¸»úÎó¤ÎÂç¾®Èæ³Ó¤Ï, Àµµ¬É½¸½Ãæ¤Îʸ»ú¥¯¥é¥¹¤ÎÈÏ°Ï»Ø + Äê¤ÈƱÍÍ, + + £±¥Ð¥¤¥È ASCII ʸ»ú < Ⱦ³Ñ¥«¥Êʸ»ú < Á´³Ñʸ»ú + + ¤È¤¤¤¦´Ø·¸¤Ë´ð¤Å¤¤¤ÆÈæ³Ó¤·¤Þ¤¹. + + ¡ü MS-DOS ÈǼ¹ԷÁ¼°¤ò´Þ¤à¥¢¡¼¥«¥¤¥Ö¤Ë¤Ä¤¤¤Æ (¤½¤ì°Ê³°¤Î·ÁÂÖ¤ÇÆþ¼ê¤µ¤ì¤¿ + Êý¤Ï̵»ë¤·¤Æ¤¯¤À¤µ¤¤) + + 1. ¥¢¡¼¥«¥¤¥Ö¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ë¥Õ¥¡¥¤¥ë + + ¥ª¥ê¥¸¥Ê¥ë¤«¤éÁ´¤¯¼ê¤ò²Ã¤¨¤Æ¤¤¤Ê¤¤¥Õ¥¡¥¤¥ë + + ACKNOWLE ¥ª¥ê¥¸¥Ê¥ë¤Î¥½¡¼¥¹¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ë ACKNOWLEDGMENT + COPYING ¥ª¥ê¥¸¥Ê¥ë¤Î¥½¡¼¥¹¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ë COPYING + FUTURES ¥ª¥ê¥¸¥Ê¥ë¤Î¥½¡¼¥¹¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ë FUTURES + LIMITATI ¥ª¥ê¥¸¥Ê¥ë¤Î¥½¡¼¥¹¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ë LIMITATIONS + NEWS ¥ª¥ê¥¸¥Ê¥ë¤Î¥½¡¼¥¹¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ë NEWS + POSIX ¥ª¥ê¥¸¥Ê¥ë¤Î¥½¡¼¥¹¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ë POSIX + PROBLEMS ¥ª¥ê¥¸¥Ê¥ë¤Î¥½¡¼¥¹¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ë PROBLEMS + README ¥ª¥ê¥¸¥Ê¥ë¤Î¥½¡¼¥¹¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ë README + + gawk+mb ÍѤΥե¡¥¤¥ë + + CHANGELO.MB gawk+mb ¤ÎÊѹ¹ÍúÎò + README.MB ¤³¤Î¥Õ¥¡¥¤¥ë + + MS-DOS ÈÇ gawk+mb ÍѤΥե¡¥¤¥ë + + GAWK.CAT ¥ª¥ê¥¸¥Ê¥ë¤Î¥½¡¼¥¹¤Ë´Þ¤Þ¤ì¤Æ¤¤¤ë¥Þ¥Ë¥å¥¢¥ë¥Ú¡¼¥¸ + gawk.1 ¤ò GNU roff ¤Ç¥Õ¥©¡¼¥Þ¥Ã¥È¤·¤¿¤â¤Î. + GAWK.EXE MS-DOS ÈÇ gawk-2.15.4+mb1.03 ¤Î¼Â¹Ô·Á¼° + READMAN.SED sed ¤ò»ý¤Ã¤Æ¤¤¤ë¿Í¤Ø¤ª¤Þ¤± + (sed -f readman.sed gawk.cat) + + 2. GAWK.EXE ¤Ë¤Ä¤¤¤Æ + + gawk-2.15.4+mb1.03 ¤ò MS-C 6.00A ¤Ç¥³¥ó¥Ñ¥¤¥ë¤·¤¿¤â¤Î¤Ç¤¹. + + ¥Ç¥Õ¥©¥ë¥È¤Ç Shift-JIS ´Á»ú¥³¡¼¥É¤ò´Þ¤à¥×¥í¥°¥é¥à¤ä¥Õ¥¡¥¤¥ë¤ò½è + Íý¤Ç¤­¤Þ¤¹. + + setargv.obj ¤òÁȤ߹þ¤ó¤Ç¤¢¤ê¤Þ¤¹¤Î¤Ç, MS-DOS ¤Ç¥Ý¥Ô¥å¥é¡¼¤Ê¥¿¥¤ + ¥×¤Î¥ï¥¤¥ë¥É¥«¡¼¥É¤¬»ÈÍѤǤ­¤Þ¤¹. 
UNIX ¤Î csh ¥é¥¤¥¯¤Ê¥ï¥¤¥ë¥É + ¥«¡¼¥ÉŸ³«¥ë¡¼¥Á¥ó¤òÍѰդ·¤è¤¦¤«¤È¤â»×¤Ã¤¿¤Î¤Ç¤¹¤¬, MS-DOS ¤Î¾ + ¤Î¥³¥Þ¥ó¥É¤È¤ÎÀ°¹çÀ­¤¬¼è¤ì¤Ê¤¤¤·, ¥ª¥ê¥¸¥Ê¥ë¤ò¤Ê¤ë¤Ù¤¯Âº½Å¤·¤¿¤«¤Ã + ¤¿¤Î¤ÇÃÇǰ¤·¤Þ¤·¤¿. + + 3. ¥³¥Þ¥ó¥É¥é¥¤¥ó°ú¿ô¤Ë¤Ä¤¤¤Æ + + Á°½Ò¤·¤¿¤È¤ª¤ê MS-C ¤Î setargv.obj ¤ò¥ê¥ó¥¯¤·¤Æ¤¤¤Þ¤¹¤Î¤Ç, ¤½¤Î + »ÅÍͤ˽¾¤ï¤Ê¤±¤ì¤Ð¤Ê¤ê¤Þ¤»¤ó. + + £±¤Ä£±¤Ä¤Î°ú¿ô¤Ï¶õÇò¤Ç¶èÀÚ¤ê¤Þ¤¹. °ú¿ô¤Ë¶õÇò, ", \, <, >, | ¤ò + ´Þ¤à¤È¤­¤Ï¥¯¥©¡¼¥Æ¥£¥ó¥°¤¬É¬ÍפǤ¹. ¤½¤ÎÊýË¡¤Ï COMMAND.COM ¤Î¥Ð + ¥°½­¤¤»ÅÍͤÈ, ¤µ¤é¤Ë setargv.obj ¤Ë¤âÌäÂ꤬¤¢¤ê, ¤«¤Ê¤êÆñ¤·¤¤¤Î + ¤Ç¤³¤³¤Ç¤ÏÀâÌÀ¤ò¾Ê¤­¤Þ¤¹. ³Æ¼«¸¦µæ¤·¤Æ¤¯¤À¤µ¤¤. °ìÈÖ´Êñ¤Ê¤Î¤Ï, + gawk ¤Î¥×¥í¥°¥é¥à¤ò¥Õ¥¡¥¤¥ë¤Ë¤·¤Æ + + gawk -f ¥Õ¥¡¥¤¥ë̾ + + ¤È¤¹¤ë¤³¤È¤Ç¤¹. + + 4. ´Ä¶­ + + AWKPATH ¤¬¥»¥Ã¥È¤µ¤ì¤Æ¤¤¤ì¤Ð, ¤½¤ÎÃͤò¥Ç¥£¥ì¥¯¥È¥ê¤Î¥ê¥¹¥È¤È¤ß¤Ê + ¤·, -f ¤Ç»ØÄꤵ¤ì¤¿¥Õ¥¡¥¤¥ë¤ò¤½¤Î¥Ç¥£¥ì¥¯¥È¥ê¤«¤éõ¤·¤Þ¤¹. PATH + ¤Ê¤É¤ÈƱÍͤË, ¤½¤ì¤¾¤ì¤Î¥Ç¥£¥ì¥¯¥È¥ê¤Ï¥»¥ß¥³¥í¥ó (;) ¤Ç¶èÀÚ¤ê¤Þ + ¤¹. + + 5. ¥Þ¥Ë¥å¥¢¥ë + + roff ·Ï¤Î¥Õ¥©¡¼¥Þ¥Ã¥¿¤ò»È¤¨¤Ê¤¤¿Í¤Î¤¿¤á¤Ë GNU roff ¤Ç¥Õ¥©¡¼¥Þ¥Ã + ¥ÈºÑ¤ß¤Î¥Þ¥Ë¥å¥¢¥ë¤òÍѰդ·¤Þ¤·¤¿. ¥Ü¡¼¥ë¥É¥Õ¥§¡¼¥¹, ¥¢¥ó¥À¡¼¥é + ¥¤¥óÂбþ¤Î less ¤Ê¤É¤Ç¤ªÆÉ¤ß¤¯¤À¤µ¤¤. ¥¨¥Ç¥£¥¿¤Ê¤É¤Ç¤Ï ^H ¤¬Æþ¤Ã + ¤Æ¤¤¤ÆÆÉ¤ß¤Ë¤¯¤¤¤È»×¤¤¤Þ¤¹. + + s/.^H//g + + ¤È¤¤¤¦ sed ¤Î¥×¥í¥°¥é¥à¤ËÄ̤»¤Ð, Ä̾ï¤Î¥Æ¥­¥¹¥È¥Õ¥¡¥¤¥ë¤¬ÆÀ¤é¤ì + ¤Þ¤¹. (^H ¤È¤¤¤¦¤Î¤Ï¥³¥ó¥È¥í¡¼¥ë¥³¡¼¥É¤òľÀÜËä¤á¤³¤à¤È¤¤¤¦°ÕÌ£ + ¤Ç¤¹.) + + ¡ü¥¤¥ó¥¹¥È¡¼¥ë (MS-DOS °Ê³°) + + ¥Ç¥Õ¥©¥ë¥È¤Î¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤ÎÀßÄê¤Ï, Makefile.in ¤ÎÃæ¤Ç»ØÄꤷ¤Þ¤¹. + ¥Ç¥Õ¥©¥ë¥È¤ò Shift-JIS ¤È¤¹¤ë¾ì¹ç¤È, ¥Ç¥Õ¥©¥ë¥È¤Ç¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤ò + »ÈÍѤ·¤Ê¤¤¾ì¹ç¤Ï Makefile.in ¤Î MBCTYPE_DEF ¥Þ¥¯¥í¤ÎÄêµÁ¤ò¤½¤ì¤¾¤ì°Ê + ²¼¤Î¤è¤¦¤ËÊѤ¨¤Æ¤¯¤À¤µ¤¤. + + MBCTYPE_DEF = -DSJIS (¥Ç¥Õ¥©¥ë¥È¤Ç Shift-JIS ¤Î¾ì¹ç) + MBCTYPE_DEF = (¥Ç¥Õ¥©¥ë¥È¤Ç»ÈÍѤ·¤Ê¤¤¾ì¹ç) + + ¤¤¤º¤ì¤Î¾ì¹ç¤Ç¤âµ¯Æ°»þ¤Î¥ª¥×¥·¥ç¥ó¤Ë¤è¤ê¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¥³¡¼¥É¤ÎÁªÂò + ¤¬²Äǽ¤Ç¤¹. + + memmove ´Ø¿ô¤¬¥é¥¤¥Ö¥é¥ê¤Ë¤Ê¤¤¥·¥¹¥Æ¥à¤Ç¤Ï¥ê¥ó¥¯»þ¤Ë¥¨¥é¡¼¤¬½Ð¤ë¤³¤È + ¤¬¤¢¤ê¤Þ¤¹. ¤½¤Î¤È¤­¤Ï config/* ¤ÎÂбþ¤¹¤ë¥Õ¥¡¥¤¥ë¤Ë + + MEMMOVE_MISSING 1 + + ¤È¤¤¤¦¹Ô¤òÄɲä·¤Æ configure ¤ò¤ä¤êľ¤·¤Æ¤¯¤À¤µ¤¤. + + ¤½¤Î¾¤Îºî¶È¤Ï, ¥ª¥ê¥¸¥Ê¥ë¤Î gawk ¤ÈƱÍͤǤ¹. + + ¡ü¥¤¥ó¥¹¥È¡¼¥ë (MS-DOS ÈÇ. 
¤³¤³¤Ç¤¤¤¦¥¤¥ó¥¹¥È¡¼¥ë¤È¤Ï, ¥½¡¼¥¹¤«¤é¤Î¥¤¥ó + ¥¹¥È¡¼¥ë¤Î¤³¤È¤Ç¤¹) + + ¤Þ¤º, pc/* ¤ò, ¤³¤Î¥Ç¥£¥ì¥¯¥È¥ê¤Ë¥³¥Ô¡¼¤·¤Æ¤¯¤À¤µ¤¤. + + A>copy pc\*.* . + + MS-C 6.00A ¤ò»ÈÍѤ·¤Æ, ¥Ç¥Õ¥©¥ë¥È¤Ç Shift-JIS ¤òǧ¼±¤¹¤ë gawk ¤òºîÀ® + ¤¹¤ë¾ì¹ç¤Ï, README.MSC ¤ËÌܤòÄ̤·¤Æ, ɬÍפʤé¥é¥¤¥Ö¥é¥ê¤Ë¥Ñ¥Ã¥Á¤òÅö + ¤Æ, + + A>nmake -f Makefile.msc DEFS= + + ¤È¤¹¤ë¤À¤±¤Ç£Ï£Ë¤Ç¤¹. ¥ª¥×¥Æ¥£¥Þ¥¤¥º¤Ï -Ox ¤Þ¤ÇÂç¾æÉפʤ褦¤Ç¤¹. ¥³ + ¥ó¥Ñ¥¤¥ë¤¬Ìµ»ö½ªÎ»¤·¤¿¤é, + + A>nmake -f Makefile.msc test + + ¤ÈÆþÎϤ·¤Æ, ưºî¥Á¥§¥Ã¥¯¤ò¤·¤Æ¤ß¤Æ¤¯¤À¤µ¤¤. + + ¤½¤Î¾¤Î½èÍý·Ï¤ò»ÈÍѤ¹¤ë¾ì¹ç¤ä, ¥Ç¥Õ¥©¥ë¥È¤ò Shift-JIS °Ê³°¤Ë¤¹¤ë¾ì + ¹ç¤Ï Makefile.msc ¤ò½ñ¤­´¹¤¨¤Æ»ÈÍѤ·¤Æ¤¯¤À¤µ¤¤. + + ¤Þ¤¿, º£²ó¤«¤é GO32 ¤ò¥µ¥Ý¡¼¥È¤·¤Æ¤¤¤Þ¤¹. + + A>copy pc\*.* . + A>del config.h + A>ren config.go3 config.h + A>make -f Makefile.go3 + + ¤È¤¹¤ì¤Ð¥³¥ó¥Ñ¥¤¥ë¤µ¤ì¤ë¤Ï¤º¤Ç¤¹. DJGPP 1.11 ¤Ç¤Î¤ß³ÎǧºÑ¤ß¤Ç¤¹. ¤¿ + ¤À¤·, DJGPP ÈÇ GNU make ¤ò»ÈÍѤ·¤¿¾ì¹ç, + + A>make -f Makefile.go3 test + + ¤¬²¿¸Î¤«¤³¤±¤Þ¤¹. (ñ¤Ë»ä¤ÎÀßÄ꤬°­¤¤¤À¤±¤«¤âÃΤì¤Þ¤»¤ó¤¬.) ¤½¤Î¤È¤­ + ¤Ï test/Makefile.dos ¤Î¤³¤±¤¿Éôʬ¤ò¸«¤Æ, ¼êư¤Ç¤ä¤Ã¤Æ¤ß¤Æ¤¯¤À¤µ¤¤. + + ¤Ê¤ª, ¤³¤Îưºî¥Á¥§¥Ã¥¯¤ò¹Ô¤¦¾ì¹ç¤Ï, rm, wc ¤Ê¤É¤Î¥Ä¡¼¥ë¤¬É¬ÍפǤ¹. + »ý¤Ã¤Æ¤¤¤Ê¤¤Êý¤Ï test/Makefile.dos ¤ò½ñ¤­´¹¤¨¤ë¤Ê¤ê, rm, wc, etc. ¤ò + ºîÀ®¤¹¤ë¤Ê¤ê¤·¤Æ¤·¤Î¤¤¤Ç¤¯¤À¤µ¤¤. redir ¤È¤¤¤¦¸«´·¤ì¤Ê¤¤¥×¥í¥°¥é¥à¤â + »ÈÍѤ·¤Æ¤¤¤Þ¤¹¤¬, ¤³¤ì¤Ï, ɸ½à¥¨¥é¡¼½ÐÎϤò¥ê¥À¥¤¥ì¥¯¥È¤·¤¿¤ê¤¹¤ë¤¿¤á + ¤Ë¨¶½¤Çºî¤Ã¤¿¥×¥í¥°¥é¥à¤Ç¤¹. ¤³¤ì¤Î¥½¡¼¥¹¤Ï¤ª¤Þ¤±¤È¤·¤Æ pc/ ¥Ç¥£¥ì + ¥¯¥È¥ê¤ËÆþ¤ì¤Æ¤ª¤­¤Þ¤·¤¿. ¤·¤ç¤¦¤â¤Ê¤¤¥×¥í¥°¥é¥à¤Ç¤¹¤Î¤Ç·è¤·¤Æ¥³¡¼¥É + ¤òÇÁ¤­¸«¤·¤¿¤ê¤·¤Ê¤¤¤è¤¦¤Ë(¾Ð). ´Ê°×¥Þ¥Ë¥å¥¢¥ë¤ò pc/redir.txt ¤Ë½ñ¤¤ + ¤Æ¤¤¤Þ¤¹. + + ¡ü¥Ð¥° + + 1. ¤¤¤ï¤æ¤ë JIS ¤Ë¤ÏÂбþ¤·¤Æ¤¤¤Þ¤»¤ó. ¾­ÍèÂбþ¤¹¤ëͽÄê¤â¤¢¤ê¤Þ¤»¤ó. + + 2. ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¥³¡¼¥É¤Ï¤¢¤Þ¤ê¸·³Ê¤Ë¤Ï¹Í¤¨¤Æ¤¤¤Þ¤»¤ó. + + EUC £±¥Ð¥¤¥ÈÌÜ ... 0x80 - 0xff + EUC £²¥Ð¥¤¥ÈÌÜ ... 0x01 - 0xff (0x0a ¤ò½ü¤¯) + + Shift-JIS £±¥Ð¥¤¥ÈÌÜ ... 0x80 - 0x9f, 0xe0 - 0xff + Shift-JIS £²¥Ð¥¤¥ÈÌÜ ... 0x01 - 0xff (0x0a ¤ò½ü¤¯) + + ¤È¤·¤Æ½èÍý¤·¤Æ¤¤¤Þ¤¹. Ⱦ³Ñ¥«¥Ê¤â»È¤¨¤ë¤Ï¤º¤Ç¤¹. EUC ¤Î SS3 + (0x8f) ¤Ë»Ï¤Þ¤ë£³¥Ð¥¤¥È¥³¡¼¥É¤Ï»È¤¨¤Þ¤»¤ó. 
(»ä¤Ï¤³¤ì¤ò¥µ¥Ý¡¼¥È¤· + ¤Æ¤¤¤ë¥·¥¹¥Æ¥à¤ò¸«¤¿¤³¤È¤¬¤Ê¤¤...) + + ¡ü¥¢¥ë¥´¥ê¥º¥à (dfa.[ch] ¤Î¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»úÂбþ²½) + + °ÊÁ°¤ÏÇùÁ³¤È, DFA ¤òľÀÜ EUC ¤ä Shift-JIS ¤Î¤è¤¦¤Êʸ»ú¼ï¤Î¿¤¤¥³¡¼¥É + ¥»¥Ã¥È¤ËÂбþ¤µ¤»¤ë¤Î¤Ï, Èó¾ï¤ËÆñ¤·¤¤¤È»×¤Ã¤Æ¤¤¤Þ¤·¤¿. ¤È¤³¤í¤¬¤¢¤ë + Æü, ¼«ºî¥é¥¤¥Ö¥é¥ê¤Î¥Æ¥¹¥ÈÍѤË, Àµµ¬É½¸½¤ò DFA ¤ØÊÑ´¹¤¹¤ë´Êñ¤Ê¥×¥í + ¥°¥é¥à¤ò½ñ¤¤¤¿¤È¤­¤Ë, ÆÍÁ³¤¦¤Þ¤¤¥¢¥¤¥Ç¥£¥¢¤¬Á®¤¤¤¿¤Î¤Ç¤¹. ¥Þ¥ë¥Á¥Ð + ¥¤¥Èʸ»ú¤È¤¤¤¨¤É¤â·ë¶É¤Ï¥Ð¥¤¥È¤ÎʤӤǤ¹. ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤ò, ¤¹¤Ù + ¤Æ¥Ð¥¤¥Èñ°Ì¤Ëʬ²ò¤·¤Æ, Àµµ¬É½¸½¤òºî¤Ã¤Æ¤·¤Þ¤¨¤Ð¤è¤«¤Ã¤¿¤Î¤Ç¤¹. + + ¸ÀÍդǤϤ¦¤Þ¤¯É½¸½¤Ç¤­¤Ê¤¤¤Î¤Ç, °Ê²¼¤Îµ­¹æ¤ò»ÈÍѤ·, ¤É¤¦¤¤¤¦¤Õ¤¦¤Ë¥Ð + ¥¤¥Èñ°Ì¤Ëʬ²ò¤·¤Æ¤¤¤ë¤Î¤«, Îã¤òµó¤²¤Þ¤¹. + + a, b, c ... ¥·¥ó¥°¥ë¥Ð¥¤¥Èʸ»ú. + x, y, z ... ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤Î£±Ê¸»úÌÜ. + + . (Ǥ°Õ¤Î£±Ê¸»ú) + ==> [a-c]|[x-z][a-z] + + (¥·¥ó¥°¥ë¥Ð¥¤¥Èʸ»ú¤«, ¤Þ¤¿¤Ï¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤Î£±Ê¸»úÌÜ¤È + Ǥ°Õ¤Î£±Ê¸»ú¤ÎÏ¢ÀÜ.) + + [xb-zx] (xb ¤«¤é zx ¤ÎÈϰϤΥޥë¥Á¥Ð¥¤¥Èʸ»ú + ==> x[b-z]|y[a-z]|z[a-x] + + yb* + ==> (yb)* + + ¼ÂºÝ¤Ë¤ÏÀµµ¬É½¸½¤òºî¤ê½Ð¤¹¤Î¤Ç¤Ï¤Ê¤¯, Àµµ¬É½¸½¤òʬ²ò¤·¤¿¥È¡¼¥¯¥ó¤òľ + ÀÜÀ¸À®¤·¤Æ¤¤¤Þ¤¹. ¤³¤ÎÊÕ, ¶½Ì£¤¬¤¢¤ëÊý¤Ï¥½¡¼¥¹¤ò¸«¤¿¤Û¤¦¤¬Áᤤ¤È»× + ¤¤¤Þ¤¹. (¤¢¤Þ¤ê¥¨¥ì¥¬¥ó¥È¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¤Î¤Ç¥½¡¼¥¹¤ò¤¸¤Ã¤¯¤ê¸«¤é¤ì¤ë + ¤Î¤ÏÃѤº¤«¤·¤¤µ¤¤â¤·¤Þ¤¹¤¬...) + + ¤³¤ì¤À¤±¤Ç¤Ï, Î㤨¤Ð¤¢¤ë¥Æ¥­¥¹¥È¤«¤é xy ¤È¤¤¤¦Ê¸»ú¤òõ¤½¤¦¤È¤¹¤ë¤È, + xxyy ¤Î¤è¤¦¤Êʸ»ú¤ÎʤӤˤޤÇÈ¿±þ¤·¤Æ¤·¤Þ¤¤¤Þ¤¹. ¤½¤³¤Ç, ¥Þ¥ë¥Á¥Ð¥¤ + ¥È¥â¡¼¥É¤Î¤È¤­¤Ë¤Ïɬ¤º "^.*(" + ¥æ¡¼¥¶¥Ñ¥¿¡¼¥ó + ")" ¤È¤·¤Æ½èÍý¤·¤Þ + ¤¹. '.*' ¤Ë¤è¤ê, '.' ¤Ï¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤Î°ìÉô¤Ë¤Ï¥Þ¥Ã¥Á¤·¤Þ¤»¤ó¤«¤é, + Ƭ½Ð¤·¤Ç¤­¤ë¤ï¤±¤Ç¤¹. + + ¡ü dfa.[ch], regex.[ch] ¤Î³ÈÄ¥»ÅÍÍ (¾¤Î¥¢¥×¥ê¥±¡¼¥·¥ç¥ó¤Ø±þÍѤ·¤¿¤¤Êý¤Ø) + + dfa.[ch], regex.[ch] ¥â¥¸¥å¡¼¥ë¤Ï mbc.[ch] ¥â¥¸¥å¡¼¥ë¤Ë°Í¸¤·¤Æ¤¤¤Þ + ¤¹. ¤Þ¤¿, ¤³¤ì¤Ï¥ª¥ê¥¸¥Ê¥ë¤Î»ÅÍͤǤ¹¤¬, dfa.[ch] ¤ò»ÈÍѤ¹¤ë¾ì¹ç¤Ï + regex.h ¤ÎÄêµÁ¤¬É¬ÍפǤ¹. + + ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤Î¥¿¥¤¥×¤Ï, mbc.[ch] ¤Î mbcinit() ¤ÇÀßÄꤷ¤Þ¤¹. + mbc.h ¤ËÄêµÁ¤µ¤ì¤Æ¤¤¤ë¥Þ¥¯¥í MBCTYPE_ASCII, MBCTYPE_EUC, + MBCTYPE_SJIS ¤Î¤¤¤º¤ì¤«¤ò mbcinit() ¤ËÅϤ·¤Æ¤¯¤À¤µ¤¤. + + dfa.[ch] ¤Ï, ¥Ñ¥¿¡¼¥ó¤Î¥³¥ó¥Ñ¥¤¥ë»þ¤Ë¤À¤±, ¤³¤Î mbc.[ch] ¤ÎÀßÄê¤ò»² + ¾È¤·¤Þ¤¹. 
¥Ñ¥¿¡¼¥ó¥Þ¥Ã¥Á¥ó¥°¤ÎºÝ¤Ï, ¥³¥ó¥Ñ¥¤¥ë»þ¤ËÀßÄꤵ¤ì¤Æ¤¤¤¿, + ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤Î¥¿¥¤¥×¤ò¸¡º÷¤·¤Þ¤¹. + + °ìÊý, regex.[ch] ¤Ï, ¥Ñ¥¿¡¼¥ó¥³¥ó¥Ñ¥¤¥ë»þ, ¥Þ¥Ã¥Á¥ó¥°»þ¤ÎξÊý¤Ç + mbc.[ch] ¤ÎÀßÄê¤ò»²¾È¤·¤Þ¤¹. ¤¬, ¤³¤Îξ¼Ô¤Ç mbc.[ch] ¤ÎÀßÄê¤òÊѹ¹¤¹ + ¤ë¤³¤È¤Ï¤Ç¤­¤Þ¤»¤ó. ¤Ä¤Þ¤ê, Shift-JIS ¤Çµ­½Ò¤µ¤ì¤¿¥Ñ¥¿¡¼¥ó¤ò, EUC + ¥Æ¥­¥¹¥È¤«¤é¸¡º÷¤¹¤ë¤È¤¤¤Ã¤¿Æ°ºî¤Ï¤Ç¤­¤Þ¤»¤ó. Ãí°Õ¤·¤Æ¤¯¤À¤µ¤¤. + + ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»úÂбþ¤Ëȼ¤Ã¤ÆÃí°Õ¤¹¤Ù¤­Àµµ¬É½¸½¤ò°Ê²¼¤Ëµ­¤·¤Þ¤¹. + + . Ǥ°Õ¤Î£±¥Ð¥¤¥Èʸ»ú, ÀµÅö¤Ê¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤Ë¥Þ¥Ã¥Á¤·¤Þ¤¹. + ¡ÖÀµÅö¤Ê¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¡×¤È¤Ï, ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤Î£±Ê¸»ú + ÌܤË, '\0' ¤Þ¤¿¤Ï '\n' °Ê³°¤¬Â³¤¯Ê¸»ú¤Î¤³¤È¤Ç¤¹. + + [x-y] ʸ»ú¥³¡¼¥É (ÆâÉôɽ¸½) ¤¬ x ¤«¤é y ¤ÎÈϰϤˤ¢¤ëǤ°Õ¤Î£±Ê¸ + »ú¤Ë¥Þ¥Ã¥Á¤·¤Þ¤¹. ¤³¤ì¤â . ¤ÈƱ¤¸¤¯, ÀµÅö¤Ç¤Ê¤¤Ê¸»ú¤Ë¤Ï + ¥Þ¥Ã¥Á¤·¤Þ¤»¤ó. + + [^x-y] ʸ»ú¥³¡¼¥É (ÆâÉôɽ¸½) ¤¬ x ¤«¤é y ¤ÎÈϰϤˤʤ¤Ç¤°Õ¤Î£±Ê¸ + »ú¤Ë¥Þ¥Ã¥Á¤·¤Þ¤¹. ÀµÅö¤Ç¤Ê¤¤Ê¸»ú¤Ë¤â¥Þ¥Ã¥Á¤·¤Þ¤¹. + + ¥Þ¥ë¥Á¥Ð¥¤¥Èʸ»ú¤ÎÆâÉôɽ¸½¤Ïñ¤Ë£±¥Ð¥¤¥ÈÌܤò¾å°Ì¥Ð¥¤¥È, £²¥Ð¥¤¥ÈÌܤò + ²¼°Ì¥Ð¥¤¥È¤È¤·¤¿£±£¶¥Ó¥Ã¥ÈÉ乿¤Ê¤·À°¿ô¤Ç¤¹. Shift-JIS ¤Ç¤â EUC ¤Ç¤â + + £±¥Ð¥¤¥È ASCII ʸ»ú < Ⱦ³Ñ¥«¥Êʸ»ú < Á´³Ñʸ»ú + + ¤È¤¤¤¦Âç¾®´Ø·¸¤¬À®¤êΩ¤Ã¤Æ¤¤¤Þ¤¹. + + ¡ü¾ò·ï¤Ê¤É + + 1. ¥ª¥ê¥¸¥Ê¥ë¤Î GNU awk ¤ÎÃøºî¸¢¤Ï Free Software Foundation, Inc. ¤¬ + Í­¤·¤Æ¤¤¤Þ¤¹. ¥Ñ¥Ã¥ÁÉôʬ (gawk-mb.diff) ¤ÎÃøºî¸¢¤Ï»ä (t^2) ¤¬Í­¤· + ¤Æ¤¤¤Þ¤¹. + + 2. GNU awk ¤Î¥½¡¼¥¹¥³¡¼¥É¤Ï³Æ½ê¤Î ftp ¥µ¥¤¥È, ¤â¤·¤¯¤Ï Nifty-serve + ¤Î FUNIX ¤Î¥Ç¡¼¥¿¥é¥¤¥Ö¥é¥ê¤«¤éÆþ¼ê²Äǽ¤Ç¤¹. GNU awk ¤«¤é gawk+mb + ¤Ø¤Îº¹Ê¬ gawk-mb.diff ¤Ï, »ä¤¬ FUNIX ¤ØÅÐÏ¿¤·, Ʋ±àÏÂϺ»á + (dohzono@sdsft.kme.mei.co.jp) ¤¬ fj.sources ¤Ø¥Ý¥¹¥È¤·¤Æ¤¯¤À¤µ¤Ã + ¤Æ¤¤¤Þ¤¹. + + 3. º¹Ê¬ gawk-mb.diff ¤ÎºÆÇÛÉۤϼ«Í³¤Ç¤¹. ¤³¤ì¤Ë´Ø¤·¤Æ¤Ï FSF ¤Îµ¬Äê¤Ë + ½¾¤¦É¬Íפ⤢¤ê¤Þ¤»¤ó. ¤·¤«¤·º¹Ê¬¤òŬÍѤ·¤¿·ë²Ì¤Î¥½¡¼¥¹¥³¡¼¥É, ¤ª + ¤è¤Ó¼Â¹Ô·Á¼°¤Ç¤ÎºÆÇÛÉÛ¤ÎºÝ¤Ï GNU GENERAL PUBLIC LICENSE (COPYING + »²¾È) ¤Ë½¾¤Ã¤Æ¤¯¤À¤µ¤¤. + + gawk+mb ¤Ë²¿¤é¤«¤Î²þÊѤò²Ã¤¨¤¿¤â¤Î¤òºÆÇÛÉÛ¤¹¤ëºÝ¤â, GNU GENERAL + PUBLIC LICENSE ¤Ë½¾¤¦¤è¤¦¤ËÃí°Õ¤·¤Æ¤¯¤À¤µ¤¤. ¤Þ¤¿ gawk+mb ¤Ë´Þ¤Þ + ¤ì¤ë¥³¡¼¥É (dfa.[ch] ¤ä regex.[ch] ¤Ê¤É) ¤òÍøÍѤ·¤¿¥×¥í¥°¥é¥à¤òÇÛ + ÉÛ¤¹¤ëºÝ¤â GNU GENERAL PUBLIC LICENSE ¤Î³ºÅöÉôʬ¤Ë½¾¤Ã¤Æ¤¯¤À¤µ¤¤. 
+ + ¤Þ¤¿µÁ̳¤Ç¤Ï¤¢¤ê¤Þ¤»¤ó¤¬ºÆÇÛÉÛ¤µ¤ì¤ëÊý¤Ï»ö¸å¤Ë¤Ç¤âÏ¢Íí¤ò¤¯¤À¤µ¤¤. + ¤½¤·¤Æ²Äǽ¤Ê¸Â¤ê, ¿·¤·¤¤¥Ð¡¼¥¸¥ç¥ó¤Ø¤Î¥¢¥Ã¥×¥Ç¡¼¥È¤ËÅØ¤á, ÍøÍÑ¼Ô + ¤«¤é¤ÎÏ¢Íí¤¬»ä¤ËÆÏ¤¯¤è¤¦¤ËÇÛθ¤·¤Æ¤¯¤À¤µ¤¤. + + 4. ¤³¤Î¥×¥í¥°¥é¥à¤Ï̵ÊݾڤǤ¹. + + 5. gawk+mb ¤Ë²¿¤é¤«¤ÎÉÔ¶ñ¹ç¤¬È¯À¸¤·¤¿¾ì¹ç, (FSF ¤ä, ¥ª¥ê¥¸¥Ê¥ë¤Îºî¼Ô + ¤Ç¤Ï¤Ê¤¯) »ä¤ËÏ¢Íí¤·¤Æ¤¯¤À¤µ¤¤. ÇÛÉÛ¤·¤¿¿Í¤¬´õ˾¤·¤Æ¤¤¤ë¾ì¹ç¤Ï, + ¤½¤Î¿Í¤ËÏ¢Íí¤·¤Æ¤¯¤À¤µ¤¤. + + 6. ¤´¼ÁÌä/¤´Í×˾/¤ª¼¸¤ê, ¤½¤Î¾¤âÂç´¿·Þ¤Ç¤¹. ¤Ç¤­¤ë¤«¤®¤ê¥µ¥Ý¡¼¥È¤· + ¤Þ¤¹. + + ¡ü¼Õ¼­ + + ¸¶ºî¼Ô¤ª¤è¤Ó FSF ¤Ë´¶¼Õ¤·¤Þ¤¹. + + ¤³¤Î¥É¥­¥å¥á¥ó¥ÈºîÀ®¤Ë´Ø¤·¤ÆÂ¿¤¯¤Î½õ¸À¤ò¤¯¤À¤µ¤Ã¤¿Æ²±àÏÂϺ»á + ¤Ë´¶¼Õ¤·¤Þ¤¹. ¤Þ¤¿, fj.sources ¤Ç¤ÎÇÛ + Éۤˤ⤴¿ÔÎÏ夤¤Æ¤¤¤Þ¤¹. + + ¤³¤ì¤Þ¤ÇžºÜ/¥Ð¥°Êó¹ð¤ò¤¯¤À¤µ¤Ã¤¿Êý¡¹¤Ë´¶¼Õ¤·¤Þ¤¹. ¼Â̾¤òµó¤²¤µ¤»¤Æ + 夭¤¿¤«¤Ã¤¿¤Î¤Ç¤¹¤¬¥Ï¡¼¥É¥Ç¥£¥¹¥¯¤Î¥È¥é¥Ö¥ë¤Ç¤Û¤È¤ó¤É¤Î¥á¡¼¥ë¤ò¾Ã¼º + ¤µ¤»¤Æ¤·¤Þ¤¤¤Þ¤·¤¿. (¥Ð¥°Êó¹ðʬ¤Ë´Ø¤·¤Æ¤Ï ChangeLog.MB ¤Ë»Ä¤Ã¤Æ¤¤¤Þ + ¤¹.) + + ºÇ¸å¤Ë, µ®½Å¤Ê¥Ç¥£¥¹¥¯¥¹¥Ú¡¼¥¹¤ò gawk+mb ¤Î¤¿¤á¤Ë³ä¤¤¤Æ¤´»ÈÍÑ夤¤Æ + ¤¤¤ë¤¹¤Ù¤Æ¤ÎÍøÍѼԤÎÊý¡¹¤Ë´¶¼Õ¤¤¤¿¤·¤Þ¤¹. + + ¡ü¡Ö»ä¡×¤ÎÏ¢ÍíÀè + + ¢©810 Ê¡²¬»ÔÃæ±û¶èÇ߸÷±àÃÄÃÏ 7-207 (Ãí: žµï¤·¤Þ¤·¤¿) + TEL/FAX: 092-731-4025 (TEL/FAX ¼«Æ°ÀÚÂØ¤¨) + 092-724-6342 (TEL ¤Î¤ß) + E-mail: NBC02362@niftyserve.or.jp ëËܹ§¹À + + # Local variables: + # mode: indented-text + # indent-tabs-mode: nil + # tab-stop-list: (4 8 16 24 32 40 48 56 64 72 80) + # left-margin: 4 + # fill-column: 72 + # fill-prefix: " " + # version-control: never + # End: diff -crP php-2.0.1/src/jp.regex/dfa.c php-2.0.1.jp_urat-5.3/src/jp.regex/dfa.c *** php-2.0.1/src/jp.regex/dfa.c Thu Jan 1 09:00:00 1970 --- php-2.0.1.jp_urat-5.3/src/jp.regex/dfa.c Wed Feb 18 21:03:40 1998 *************** *** 0 **** --- 1,2865 ---- + /* dfa.c - deterministic extended regexp routines for GNU + Copyright (C) 1988 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + + /* Written June, 1988 by Mike Haertel + Modified July, 1988 by Arthur David Olson to assist BMG speedups */ + /* Multi-byte extension added May, 1993 by t^2 (Takahiro Tanimoto) + Last change: Aug. 28, 1994 by t^2 */ + + #include <assert.h> + #include <ctype.h> + #include <stdio.h> + + #ifdef HAVE_CONFIG_H + #include "config.h" + #endif + + #ifdef STDC_HEADERS + #include <stdlib.h> + #else + #include <sys/types.h> + extern char *calloc(), *malloc(), *realloc(); + extern void free(); + #endif + + #if defined(HAVE_STRING_H) || defined(STDC_HEADERS) + #include <string.h> + #undef index + #define index strchr + #undef bcopy + #define bcopy(s, d, n) memcpy(d, s, n) + #undef bzero + #define bzero(d, n) memset(d, 0, n) + #else + #include <strings.h> + #endif + + #ifndef DEBUG /* use the same approach as regex.c */ + #undef assert + #define assert(e) + #endif /* DEBUG */ + + #ifndef isgraph + #define isgraph(C) (isprint(C) && !isspace(C)) + #endif + + #ifdef isascii + #define ISALPHA(C) (isascii(C) && isalpha(C)) + #define ISUPPER(C) (isascii(C) && isupper(C)) + #define ISLOWER(C) (isascii(C) && islower(C)) + #define ISDIGIT(C) (isascii(C) && isdigit(C)) + #define ISXDIGIT(C) (isascii(C) && isxdigit(C)) + #define ISSPACE(C) (isascii(C) && isspace(C)) + #define ISPUNCT(C) (isascii(C) && ispunct(C)) + #define ISALNUM(C) (isascii(C) && isalnum(C)) + #define ISPRINT(C) (isascii(C) && isprint(C)) + #define ISGRAPH(C) (isascii(C) && isgraph(C)) + #define ISCNTRL(C) (isascii(C) && iscntrl(C)) + #else + #define ISALPHA(C) isalpha(C) + #define ISUPPER(C) isupper(C) + #define ISLOWER(C) islower(C) + #define ISDIGIT(C)
isdigit(C) + #define ISXDIGIT(C) isxdigit(C) + #define ISSPACE(C) isspace(C) + #define ISPUNCT(C) ispunct(C) + #define ISALNUM(C) isalnum(C) + #define ISPRINT(C) isprint(C) + #define ISGRAPH(C) isgraph(C) + #define ISCNTRL(C) iscntrl(C) + #endif + + #include "regex.h" + #include "dfa.h" + #include "mbc.h" + + #ifdef __STDC__ + typedef void *ptr_t; + #else + typedef char *ptr_t; + #endif + + static void dfamust _RE_ARGS((struct dfa *dfa)); + + static ptr_t xcalloc _RE_ARGS((size_t n, size_t s)); + static ptr_t xmalloc _RE_ARGS((size_t n)); + static ptr_t xrealloc _RE_ARGS((ptr_t p, size_t n)); + #ifdef DEBUG + static void prtok _RE_ARGS((token t)); + #endif + static int tstbit _RE_ARGS((int b, charclass c)); + static void setbit _RE_ARGS((int b, charclass c)); + static void clrbit _RE_ARGS((int b, charclass c)); + static void copyset _RE_ARGS((charclass src, charclass dst)); + static void zeroset _RE_ARGS((charclass s)); + static void notset _RE_ARGS((charclass s)); + static int equal _RE_ARGS((charclass s1, charclass s2)); + static int charclass_index _RE_ARGS((charclass s)); + static int looking_at _RE_ARGS((const char *s)); + static token lex _RE_ARGS((void)); + static void addtok _RE_ARGS((token t)); + static void atom _RE_ARGS((void)); + static int nsubtoks _RE_ARGS((int tindex)); + static void copytoks _RE_ARGS((int tindex, int ntokens)); + static void closure _RE_ARGS((void)); + static void branch _RE_ARGS((void)); + static void regexp _RE_ARGS((int toplevel)); + static void copy _RE_ARGS((position_set *src, position_set *dst)); + static void insert _RE_ARGS((position p, position_set *s)); + static void merge _RE_ARGS((position_set *s1, position_set *s2, position_set *m)); + static void delete _RE_ARGS((position p, position_set *s)); + static int state_index _RE_ARGS((struct dfa *d, position_set *s, + int newline, int letter)); + static void build_state _RE_ARGS((int s, struct dfa *d)); + static void build_state_zero _RE_ARGS((struct dfa *d)); + static char 
*icatalloc _RE_ARGS((char *old, char *new)); + static char *icpyalloc _RE_ARGS((char *string)); + static char *istrstr _RE_ARGS((char *lookin, char *lookfor)); + static void ifree _RE_ARGS((char *cp)); + static void freelist _RE_ARGS((char **cpp)); + static char **enlist _RE_ARGS((char **cpp, char *new, size_t len)); + static char **comsubs _RE_ARGS((char *left, char *right)); + static char **addlists _RE_ARGS((char **old, char **new)); + static char **inboth _RE_ARGS((char **left, char **right)); + + static ptr_t + xcalloc(n, s) + size_t n; + size_t s; + { + ptr_t r = calloc(n, s); + + if (!r) + fprintf(stderr,"Memory exhausted"); + return r; + } + + static ptr_t + xmalloc(n) + size_t n; + { + ptr_t r = malloc(n); + + assert(n != 0); + if (!r) + fprintf(stderr,"Memory exhausted"); + return r; + } + + static ptr_t + xrealloc(p, n) + ptr_t p; + size_t n; + { + ptr_t r = realloc(p, n); + + assert(n != 0); + if (!r) + fprintf(stderr,"Memory exhausted"); + return r; + } + + #define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) + #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) + #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) + + /* Reallocate an array of type t if nalloc is too small for index. 
*/ + #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ + if ((index) >= (nalloc)) \ + { \ + while ((index) >= (nalloc)) \ + (nalloc) *= 2; \ + REALLOC(p, t, nalloc); \ + } + + #ifdef DEBUG + + static void + prtok(t) + token t; + { + char *s; + + if (t < 0) + fprintf(stderr, "END"); + else if (t < NOTCHAR) + if (t & 0x80) + fprintf(stderr, "0x%02x", (unsigned char)t); + else + fprintf(stderr, "%c", t); + else + { + switch (t) + { + case EMPTY: s = "EMPTY"; break; + case BACKREF: s = "BACKREF"; break; + case BEGLINE: s = "BEGLINE"; break; + case ENDLINE: s = "ENDLINE"; break; + case BEGWORD: s = "BEGWORD"; break; + case ENDWORD: s = "ENDWORD"; break; + case LIMWORD: s = "LIMWORD"; break; + case NOTLIMWORD: s = "NOTLIMWORD"; break; + case QMARK: s = "QMARK"; break; + case STAR: s = "STAR"; break; + case PLUS: s = "PLUS"; break; + case CAT: s = "CAT"; break; + case OR: s = "OR"; break; + case ORTOP: s = "ORTOP"; break; + case LPAREN: s = "LPAREN"; break; + case RPAREN: s = "RPAREN"; break; + default: s = "CSET"; break; + } + fprintf(stderr, "%s", s); + } + } + #endif /* DEBUG */ + + /* Stuff pertaining to charclasses. 
*/ + + static int + tstbit(b, c) + int b; + charclass c; + { + return c[b / INTBITS] & 1 << b % INTBITS; + } + + static void + setbit(b, c) + int b; + charclass c; + { + c[b / INTBITS] |= 1 << b % INTBITS; + } + + static void + clrbit(b, c) + int b; + charclass c; + { + c[b / INTBITS] &= ~(1 << b % INTBITS); + } + + static void + copyset(src, dst) + charclass src; + charclass dst; + { + int i; + + for (i = 0; i < CHARCLASS_INTS; ++i) + dst[i] = src[i]; + } + + static void + zeroset(s) + charclass s; + { + int i; + + for (i = 0; i < CHARCLASS_INTS; ++i) + s[i] = 0; + } + + static void + notset(s) + charclass s; + { + int i; + + for (i = 0; i < CHARCLASS_INTS; ++i) + s[i] = ~s[i]; + } + + static int + equal(s1, s2) + charclass s1; + charclass s2; + { + int i; + + for (i = 0; i < CHARCLASS_INTS; ++i) + if (s1[i] != s2[i]) + return 0; + return 1; + } + + static int + isemptyset(s) + charclass s; + { + int i; + + for (i = 0; i < CHARCLASS_INTS; i++) + if (s[i]) + return 0; + return 1; + } + + /* A pointer to the current dfa is kept here during parsing. */ + static struct dfa *dfa; + + /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ + static int + charclass_index(s) + charclass s; + { + int i; + + for (i = 0; i < dfa->cindex; ++i) + if (equal(s, dfa->charclasses[i])) + return i; + REALLOC_IF_NECESSARY(dfa->charclasses, charclass, dfa->calloc, dfa->cindex); + ++dfa->cindex; + copyset(s, dfa->charclasses[i]); + return i; + } + + /* Syntax bits controlling the behavior of the lexical analyzer. */ + static reg_syntax_t syntax_bits, syntax_bits_set; + + /* Flag for case-folding letters into sets. */ + static int case_fold; + + /* Entry point to set syntax options. */ + void + dfasyntax(bits, fold) + reg_syntax_t bits; + int fold; + { + syntax_bits_set = 1; + syntax_bits = bits; + case_fold = fold; + } + + /* Lexical analyzer. All the dross that deals with the obnoxious + GNU Regex syntax bits is located here. 
The poor, suffering + reader is referred to the GNU Regex documentation for the + meaning of the @#%!@#%^!@ syntax bits. */ + + static char *lexstart; /* Pointer to beginning of input string. */ + static char *lexptr; /* Pointer to next input character. */ + static lexleft; /* Number of characters remaining. */ + static token lasttok; /* Previous token returned; initially END. */ + static int laststart; /* True if we're separated from beginning or (, | + only by zero-width characters. */ + static int parens; /* Count of outstanding left parens. */ + static int minrep, maxrep; /* Repeat counts for {m,n}. */ + + static charclass cs_cset[8]; + static unsigned char cs_ready[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + + static enum { + MBEXTTOK_NONE = -1, + MBEXTTOK_NOTCHAR = 256, + MBEXTTOK_ORMBC = MBEXTTOK_NOTCHAR, + MBEXTTOK_ORMBC_NL, + MBEXTTOK_CLASS, + MBEXTTOK_INVCLASS, + } mbexttok = MBEXTTOK_NONE; + + static charclass mbcset_set; + static charclass mbcset_all; + static charclass mbcset[128]; /* 128*256/8 = 4 Kbytes */ + + /* ÉÑÈˤ˻ÈÍѤµ¤ì¤ë (¤È»×¤ï¤ì¤ë) ʸ»ú½¸¹ç¤ò¥È¡¼¥¯¥ó¤È¤·¤ÆÊÖ¤¹. + n = 0 ... 1¥Ð¥¤¥Èʸ»úÁ´ÂΤν¸¹ç. + 1 ... 2¥Ð¥¤¥Èʸ»ú¤Î1¥Ð¥¤¥ÈÌÜÁ´ÂΤν¸¹ç. + 2 ... 2¥Ð¥¤¥Èʸ»ú¤Î2¥Ð¥¤¥ÈÌÜÁ´ÂΤν¸¹ç. + +4 ... '\n'¤ò½ü³°¤·¤Ê¤¤. */ + static token + setcodeset(n) + int n; + { + token c; + + if (!cs_ready[n]) { + zeroset(cs_cset[n]); + switch (n) { + case 0: + case 4: + /* 1¥Ð¥¤¥Èʸ»úÁ´ÂΤν¸¹ç. */ + for (c = 0; c < NOTCHAR; c++) + if (ismbchar(c)) + setbit(c, cs_cset[n]); + notset(cs_cset[n]); + break; + case 1: + case 5: + /* 2¥Ð¥¤¥Èʸ»ú¤Î1ʸ»úÌÜÁ´ÂΤν¸¹ç. */ + for (c = 0; c < NOTCHAR; c++) + if (ismbchar(c)) + setbit(c, cs_cset[n]); + break; + case 2: + case 6: + /* 2¥Ð¥¤¥Èʸ»ú¤Î2ʸ»úÌÜÁ´ÂΤν¸¹ç. 
*/ + notset(cs_cset[n]); + break; + } + if (!(n & 4)) { + if (syntax_bits & RE_DOT_NOT_NULL || n != 0) + clrbit('\0', cs_cset[n]); + if (!(syntax_bits & RE_DOT_NEWLINE) || n != 0) + clrbit('\n', cs_cset[n]); + } + cs_ready[n] = 1; + } + return CSET + charclass_index(cs_cset[n]); + } + + /* Note that characters become unsigned here. */ + #define FETCH(c, eoferr) \ + { \ + if (! lexleft) \ + if (eoferr != 0) \ + fprintf(stderr,eoferr); \ + else \ + return lasttok = END; \ + (c) = (unsigned char) *lexptr++; \ + --lexleft; \ + } + + #ifdef __STDC__ + #define FUNC(F, P) static int F(int c) { return P(c); } + #else + #define FUNC(F, P) static int F(c) int c; { return P(c); } + #endif + + FUNC(is_alpha, ISALPHA) + FUNC(is_upper, ISUPPER) + FUNC(is_lower, ISLOWER) + FUNC(is_digit, ISDIGIT) + FUNC(is_xdigit, ISXDIGIT) + FUNC(is_space, ISSPACE) + FUNC(is_punct, ISPUNCT) + FUNC(is_alnum, ISALNUM) + FUNC(is_print, ISPRINT) + FUNC(is_graph, ISGRAPH) + FUNC(is_cntrl, ISCNTRL) + + /* The following list maps the names of the Posix named character classes + to predicate functions that determine whether a given character is in + the class. The leading [ has already been eaten by the lexical analyzer. */ + static struct { + const char *name; + int (*pred) _RE_ARGS((int)); + } prednames[] = { + { ":alpha:]", is_alpha }, + { ":upper:]", is_upper }, + { ":lower:]", is_lower }, + { ":digit:]", is_digit }, + { ":xdigit:]", is_xdigit }, + { ":space:]", is_space }, + { ":punct:]", is_punct }, + { ":alnum:]", is_alnum }, + { ":print:]", is_print }, + { ":graph:]", is_graph }, + { ":cntrl:]", is_cntrl }, + { 0 } + }; + + static int + looking_at(s) + const char *s; + { + size_t len; + + len = strlen(s); + if (lexleft < len) + return 0; + return strncmp(s, lexptr, len) == 0; + } + + static token + lex() + { + token c, c1, c2; + int backslash = 0, invert; + charclass ccl; + int i; + + /* Basic plan: We fetch a character. 
If it's a backslash, + we set the backslash flag and go through the loop again. + On the plus side, this avoids having a duplicate of the + main switch inside the backslash case. On the minus side, + it means that just about every case begins with + "if (backslash) ...". */ + mbexttok = MBEXTTOK_NONE; + for (i = 0; i < 2; ++i) + { + FETCH(c, 0); + switch (c) + { + case '\\': + if (backslash) + goto normal_char; + if (lexleft == 0) + fprintf(stderr,"Unfinished \\ escape"); + backslash = 1; + break; + + case '^': + if (backslash) + goto normal_char; + if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS + || lasttok == END + || lasttok == LPAREN + || lasttok == OR) + return lasttok = BEGLINE; + goto normal_char; + + case '$': + if (backslash) + goto normal_char; + if (syntax_bits & RE_CONTEXT_INDEP_ANCHORS + || lexleft == 0 + || (syntax_bits & RE_NO_BK_PARENS + ? lexleft > 0 && *lexptr == ')' + : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == ')') + || (syntax_bits & RE_NO_BK_VBAR + ? lexleft > 0 && *lexptr == '|' + : lexleft > 1 && lexptr[0] == '\\' && lexptr[1] == '|') + || ((syntax_bits & RE_NEWLINE_ALT) + && lexleft > 0 && *lexptr == '\n')) + return lasttok = ENDLINE; + goto normal_char; + + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (backslash && !(syntax_bits & RE_NO_BK_REFS)) + { + laststart = 0; + return lasttok = BACKREF; + } + goto normal_char; + + case '<': + if (syntax_bits & RE_NO_GNU_OPS) + goto normal_char; + if (backslash) + return lasttok = BEGWORD; + goto normal_char; + + case '>': + if (syntax_bits & RE_NO_GNU_OPS) + goto normal_char; + if (backslash) + return lasttok = ENDWORD; + goto normal_char; + + case 'b': + if (syntax_bits & RE_NO_GNU_OPS) + goto normal_char; + if (backslash) + return lasttok = LIMWORD; + goto normal_char; + + case 'B': + if (syntax_bits & RE_NO_GNU_OPS) + goto normal_char; + if (backslash) + return lasttok = NOTLIMWORD; + goto normal_char; + + case '?': + if 
(syntax_bits & RE_LIMITED_OPS) + goto normal_char; + if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) + goto normal_char; + if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) + goto normal_char; + return lasttok = QMARK; + + case '*': + if (backslash) + goto normal_char; + if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) + goto normal_char; + return lasttok = STAR; + + case '+': + if (syntax_bits & RE_LIMITED_OPS) + goto normal_char; + if (backslash != ((syntax_bits & RE_BK_PLUS_QM) != 0)) + goto normal_char; + if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) + goto normal_char; + return lasttok = PLUS; + + case '{': + if (!(syntax_bits & RE_INTERVALS)) + goto normal_char; + if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) + goto normal_char; + minrep = maxrep = 0; + /* Cases: + {M} - exact count + {M,} - minimum count, maximum is infinity + {,M} - 0 through M + {M,N} - M through N */ + FETCH(c, "unfinished repeat count"); + if (ISDIGIT(c)) + { + minrep = c - '0'; + for (;;) + { + FETCH(c, "unfinished repeat count"); + if (!ISDIGIT(c)) + break; + minrep = 10 * minrep + c - '0'; + } + } + else if (c != ',') + fprintf(stderr,"malformed repeat count"); + if (c == ',') + for (;;) + { + FETCH(c, "unfinished repeat count"); + if (!ISDIGIT(c)) + break; + maxrep = 10 * maxrep + c - '0'; + } + else + maxrep = minrep; + if (!(syntax_bits & RE_NO_BK_BRACES)) + { + if (c != '\\') + fprintf(stderr,"malformed repeat count"); + FETCH(c, "unfinished repeat count"); + } + if (c != '}') + fprintf(stderr,"malformed repeat count"); + laststart = 0; + return lasttok = REPMN; + + case '|': + if (syntax_bits & RE_LIMITED_OPS) + goto normal_char; + if (backslash != ((syntax_bits & RE_NO_BK_VBAR) == 0)) + goto normal_char; + laststart = 1; + return lasttok = OR; + + case '\n': + if (syntax_bits & RE_LIMITED_OPS + || backslash + || !(syntax_bits & RE_NEWLINE_ALT)) + goto normal_char; + laststart = 1; + return lasttok = OR; + + case '(': + if (backslash != 
((syntax_bits & RE_NO_BK_PARENS) == 0)) + goto normal_char; + ++parens; + laststart = 1; + return lasttok = LPAREN; + + case ')': + if (backslash != ((syntax_bits & RE_NO_BK_PARENS) == 0)) + goto normal_char; + if (parens == 0 && syntax_bits & RE_UNMATCHED_RIGHT_PAREN_ORD) + goto normal_char; + --parens; + laststart = 0; + return lasttok = RPAREN; + + case '.': + if (backslash) + goto normal_char; + if (current_mbctype != MBCTYPE_ASCII) + mbexttok = MBEXTTOK_ORMBC; + laststart = 0; + return lasttok = setcodeset(0); + + case 'w': + if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) + goto normal_char; + zeroset(ccl); + for (c2 = 0; c2 < NOTCHAR; ++c2) + if (ISALNUM(c2)) + setbit(c2, ccl); + laststart = 0; + return lasttok = CSET + charclass_index(ccl); + + case 'W': + if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) + goto normal_char; + zeroset(ccl); + for (c2 = 0; c2 < NOTCHAR; ++c2) + if (!ISALNUM(c2) && !ismbchar(c2)) + setbit(c2, ccl); + mbexttok = MBEXTTOK_ORMBC_NL; + laststart = 0; + return lasttok = CSET + charclass_index(ccl); + + case '[': + if (backslash) + goto normal_char; + zeroset(ccl); + FETCH(c, "Unbalanced ["); + if (c == '^') + { + FETCH(c, "Unbalanced ["); + invert = 1; + } + else + invert = 0; + do + { + unsigned char ch = 0, c2h = 0; + + /* Nobody ever said this had to be fast. :-) + Note that if we're looking at some other [:...:] + construct, we just treat it as a bunch of ordinary + characters. We can do this because we assume + regex has checked for syntax errors before + dfa is ever called. 
*/ + if (c == '[' && (syntax_bits & RE_CHAR_CLASSES)) + for (c1 = 0; prednames[c1].name; ++c1) + if (looking_at(prednames[c1].name)) + { + for (c2 = 0; c2 < NOTCHAR; ++c2) + if ((*prednames[c1].pred)(c2)) + setbit(c2, ccl); + lexptr += strlen(prednames[c1].name); + lexleft -= strlen(prednames[c1].name); + FETCH(c1, "Unbalanced ["); + goto skip; + } + if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + FETCH(c, "Unbalanced ["); + if (ismbchar(c)) { + ch = (unsigned char)c; + FETCH(c, "Multi-byte char incomplete"); + } + FETCH(c1, "Unbalanced ["); + if (c1 == '-') + { + FETCH(c2, "Unbalanced ["); + if (c2 == ']') + { + /* In the case [x-], the - is an ordinary hyphen, + which is left in c1, the lookahead character. */ + --lexptr; + ++lexleft; + c2 = c; + } + else + { + if (c2 == '\\' + && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + FETCH(c2, "Unbalanced ["); + if (ismbchar(c2)) { + c2h = (unsigned char)c2; + FETCH(c2, "Multi-byte char incomplete"); + } + FETCH(c1, "Unbalanced ["); + } + } + else { + c2h = ch; + c2 = c; + } + if (ch < c2h || (ch == c2h && c <= c2)) { + if (ch == 0) { + ch = (unsigned char)c2; + if (c2h > 0) + ch = NOTCHAR - 1; + for (; (unsigned char)c <= ch; c++) { + setbit(c, ccl); + if (case_fold) { + if (ISUPPER(c)) + setbit(tolower(c), ccl); + else if (ISLOWER(c)) + setbit(toupper(c), ccl); + } + } + ch = 0x80; + c = 0x00; + } + if (ch <= c2h) { + if (mbexttok < 0) { + mbexttok = MBEXTTOK_CLASS; + zeroset(mbcset_set); + zeroset(mbcset_all); + } + if (ch < c2h && c != 0x00) { /* ºÇ½é¤ÎȾü */ + int t; + + if (ismbchar(ch) + && ((t = tstbit(ch, mbcset_set)) + || !tstbit(ch, mbcset_all))) { + if (!t) { + setbit(ch, mbcset_set); + zeroset(mbcset[ch & 0177]); + } + for (; c < NOTCHAR; c++) + setbit(c, mbcset[ch & 0177]); + } + ch++; + c = 0x00; + } + if (ch < c2h || (ch == c2h && c == 0x00 && c2 == 0xff)) { + if (c == 0x00 && c2 == 0xff) + c2h++; + for (; ch < c2h; ch++) + if (ismbchar(ch)) { + clrbit(ch, mbcset_set); + setbit(ch, 
mbcset_all); + } + if (c == 0x00 && c2 == 0xff) + c2h--; + c = 0x00; + } + if (ch <= c2h) { + int t; + + /* ¤³¤³¤Ç¤Ïɬ¤º c <= c2 ¤È¤Ê¤Ã¤Æ¤¤¤ë. */ + if (ismbchar(ch) + && ((t = tstbit(ch, mbcset_set)) + || !tstbit(ch, mbcset_all))) { + if (!t) { + setbit(ch, mbcset_set); + zeroset(mbcset[ch & 0177]); + } + for (; c <= c2; c++) + setbit(c, mbcset[ch & 0177]); + } + } + } + } + skip: + ; + } + while ((c = c1) != ']'); + if (invert) + { + notset(ccl); + if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) + clrbit('\n', ccl); + if (mbexttok == MBEXTTOK_CLASS) { + mbexttok = MBEXTTOK_INVCLASS; + if (!isemptyset(mbcset_set)) { + for (c = 0x80; c <= 0xff; c++) + if (tstbit(c, mbcset_set)) + notset(mbcset[c & 0177]); + } + notset(mbcset_all); + } + else + mbexttok = MBEXTTOK_ORMBC_NL; + } + if (current_mbctype != MBCTYPE_ASCII) + for (c = 0x80; c <= 0xff; c++) + if (ismbchar(c)) + clrbit(c, ccl); + laststart = 0; + return lasttok = CSET + charclass_index(ccl); + + default: + normal_char: + laststart = 0; + if (ismbchar(c)) { + FETCH(mbexttok, "Multi-byte char incomplete"); + return c; + } + if (case_fold && ISALPHA(c)) + { + zeroset(ccl); + setbit(c, ccl); + if (isupper(c)) + setbit(tolower(c), ccl); + else + setbit(toupper(c), ccl); + return lasttok = CSET + charclass_index(ccl); + } + return c; + } + } + + /* The above loop should consume at most a backslash + and some other character. */ + abort(); + } + + /* Recursive descent parser for regular expressions. */ + + static token tok; /* Lookahead token. */ + static depth; /* Current depth of a hypothetical stack + holding deferred productions. This is + used to determine the depth that will be + required of the real stack later on in + dfaanalyze(). */ + + /* Add the given token to the parse tree, maintaining the depth count and + updating the maximum depth if necessary. 
*/
+ static void
+ addtok(t)
+      token t;
+ {
+   REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
+   dfa->tokens[dfa->tindex++] = t;
+
+   switch (t)
+     {
+     case QMARK:
+     case STAR:
+     case PLUS:
+       break;                    /* unary: one operand in, one result out */
+
+     case CAT:
+     case OR:
+     case ORTOP:
+       --depth;                  /* binary: two operands in, one result out */
+       break;
+
+     default:
+       ++dfa->nleaves;           /* a leaf that is not EMPTY is counted ... */
+       /* ... and then falls through on purpose */
+     case EMPTY:
+       ++depth;                  /* every leaf pushes one entry */
+       break;
+     }
+   if (depth > dfa->depth)
+     dfa->depth = depth;
+ }
+
+ /* The grammar understood by the parser is as follows.
+
+    regexp:
+      regexp OR branch
+      branch
+
+    branch:
+      branch closure
+      closure
+
+    closure:
+      closure QMARK
+      closure STAR
+      closure PLUS
+      atom
+
+    atom:
+      a single character token (0 <= tok < NOTCHAR)
+      CSET
+      BACKREF
+      BEGLINE
+      ENDLINE
+      BEGWORD
+      ENDWORD
+      LIMWORD
+      NOTLIMWORD
+      LPAREN regexp RPAREN
+      (empty)
+
+    The parser builds a parse tree in postfix form in an array of tokens.
+
+    atom() additionally looks at mbexttok, which lex() sets alongside the
+    token it returned:
+      - a value below MBEXTTOK_NOTCHAR is emitted as a second leaf and
+        concatenated to tok (tok, mbexttok, CAT);
+      - MBEXTTOK_ORMBC / MBEXTTOK_ORMBC_NL emit tok OR'd with the
+        concatenation of two setcodeset() classes (1,2 or 5,6);
+      - MBEXTTOK_CLASS / MBEXTTOK_INVCLASS emit tok OR'd with one
+        alternative per byte marked in mbcset_set, plus one alternative
+        for the bytes marked in mbcset_all.  */
+
+ static void
+ atom()
+ {
+   if (mbexttok >= 0) {
+     if (mbexttok < MBEXTTOK_NOTCHAR) {
+       addtok(tok);
+       addtok(mbexttok);
+       addtok(CAT);
+     }
+     else
+       switch (mbexttok) {
+       case MBEXTTOK_ORMBC:
+       case MBEXTTOK_ORMBC_NL:
+         addtok(tok);
+         if (mbexttok == MBEXTTOK_ORMBC) {
+           addtok(setcodeset(1));
+           addtok(setcodeset(2));
+         }
+         else {
+           addtok(setcodeset(5));
+           addtok(setcodeset(6));
+         }
+         addtok(CAT);
+         addtok(OR);
+         break;
+       case MBEXTTOK_CLASS:
+       case MBEXTTOK_INVCLASS:
+         {
+           token c;
+
+           addtok(tok);
+           if (!isemptyset(mbcset_set))
+             for (c = 0x80; c <= 0xff; c++)
+               if (tstbit(c, mbcset_set)) {
+                 /* Make sure all bits in mbcset_all valid. */
+                 clrbit(c, mbcset_all);
+                 addtok(c);
+                 if (mbexttok == MBEXTTOK_CLASS) {
+                   clrbit('\n', mbcset[c & 0177]);
+                   clrbit('\0', mbcset[c & 0177]);
+                 }
+                 else {
+                   setbit('\n', mbcset[c & 0177]);
+                   setbit('\0', mbcset[c & 0177]);
+                 }
+                 addtok(CSET + charclass_index(mbcset[c & 0177]));
+                 addtok(CAT);
+                 addtok(OR);
+               }
+           if (!isemptyset(mbcset_all)) {
+             addtok(CSET + charclass_index(mbcset_all));
+             if (mbexttok == MBEXTTOK_CLASS)
+               addtok(setcodeset(2));
+             else
+               addtok(setcodeset(6));
+             addtok(CAT);
+             addtok(OR);
+           }
+         }
+         break;
+       default:
+         break;
+       }
+     tok = lex();
+   } else
+   if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
+       || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
+       || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD)
+     {
+       addtok(tok);
+       tok = lex();
+     }
+   else if (tok == LPAREN)
+     {
+       tok = lex();
+       regexp(0);
+       if (tok != RPAREN)
+         fprintf(stderr,"Unbalanced (");  /* reported only; parsing goes on */
+       tok = lex();
+     }
+   else
+     addtok(EMPTY);
+ }
+
+ /* Return the number of tokens in the given subexpression.
+    TINDEX is one past the last token of the subexpression in the postfix
+    array.  A leaf counts 1, a unary operator counts itself plus its
+    operand, and a binary operator counts itself plus both operands; the
+    right operand ends at tindex - 1 and the left one just before it.  */
+ static int
+ nsubtoks(tindex)
+      int tindex;
+ {
+   int ntoks1;
+
+   switch (dfa->tokens[tindex - 1])
+     {
+     default:
+       return 1;
+     case QMARK:
+     case STAR:
+     case PLUS:
+       return 1 + nsubtoks(tindex - 1);
+     case CAT:
+     case OR:
+     case ORTOP:
+       ntoks1 = nsubtoks(tindex - 1);
+       return 1 + ntoks1 + nsubtoks(tindex - 1 - ntoks1);
+     }
+ }
+
+ /* Copy the given subexpression to the top of the tree.
*/ + static void + copytoks(tindex, ntokens) + int tindex, ntokens; + { + int i; + + for (i = 0; i < ntokens; ++i) + addtok(dfa->tokens[tindex + i]); + } + + static void + closure() + { + int tindex, ntokens, i; + + atom(); + while (tok == QMARK || tok == STAR || tok == PLUS || tok == REPMN) + if (tok == REPMN) + { + ntokens = nsubtoks(dfa->tindex); + tindex = dfa->tindex - ntokens; + if (maxrep == 0) + addtok(PLUS); + if (minrep == 0) + addtok(QMARK); + for (i = 1; i < minrep; ++i) + { + copytoks(tindex, ntokens); + addtok(CAT); + } + for (; i < maxrep; ++i) + { + copytoks(tindex, ntokens); + addtok(QMARK); + addtok(CAT); + } + tok = lex(); + } + else + { + addtok(tok); + tok = lex(); + } + } + + static void + branch() + { + closure(); + while (tok != RPAREN && tok != OR && tok >= 0) + { + closure(); + addtok(CAT); + } + } + + static void + regexp(toplevel) + int toplevel; + { + branch(); + while (tok == OR) + { + tok = lex(); + branch(); + if (toplevel) + addtok(ORTOP); + else + addtok(OR); + } + } + + /* Main entry point for the parser. S is a string to be parsed, len is the + length of the string, so s can include NUL characters. D is a pointer to + the struct dfa to parse into. */ + void + dfaparse(s, len, d) + char *s; + size_t len; + struct dfa *d; + + { + dfa = d; + lexstart = lexptr = s; + lexleft = len; + lasttok = END; + laststart = 1; + parens = 0; + + if (! syntax_bits_set) + fprintf(stderr,"No syntax specified"); + + tok = lex(); + depth = d->depth; + + regexp(1); + + if (tok != END) + fprintf(stderr,"Unbalanced )"); + + addtok(END - d->nregexps); + addtok(CAT); + + if (d->nregexps) + addtok(ORTOP); + + ++d->nregexps; + } + + /* Some primitives for operating on sets of positions. */ + + /* Copy one set to another; the destination must be large enough. 
*/ + static void + copy(src, dst) + position_set *src; + position_set *dst; + { + int i; + + for (i = 0; i < src->nelem; ++i) + dst->elems[i] = src->elems[i]; + dst->nelem = src->nelem; + } + + /* Insert a position in a set. Position sets are maintained in sorted + order according to index. If position already exists in the set with + the same index then their constraints are logically or'd together. + S->elems must point to an array large enough to hold the resulting set. */ + static void + insert(p, s) + position p; + position_set *s; + { + int i; + position t1, t2; + + for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) + ; + if (i < s->nelem && p.index == s->elems[i].index) + s->elems[i].constraint |= p.constraint; + else + { + t1 = p; + ++s->nelem; + while (i < s->nelem) + { + t2 = s->elems[i]; + s->elems[i++] = t1; + t1 = t2; + } + } + } + + /* Merge two sets of positions into a third. The result is exactly as if + the positions of both sets were inserted into an initially empty set. */ + static void + merge(s1, s2, m) + position_set *s1; + position_set *s2; + position_set *m; + { + int i = 0, j = 0; + + m->nelem = 0; + while (i < s1->nelem && j < s2->nelem) + if (s1->elems[i].index > s2->elems[j].index) + m->elems[m->nelem++] = s1->elems[i++]; + else if (s1->elems[i].index < s2->elems[j].index) + m->elems[m->nelem++] = s2->elems[j++]; + else + { + m->elems[m->nelem] = s1->elems[i++]; + m->elems[m->nelem++].constraint |= s2->elems[j++].constraint; + } + while (i < s1->nelem) + m->elems[m->nelem++] = s1->elems[i++]; + while (j < s2->nelem) + m->elems[m->nelem++] = s2->elems[j++]; + } + + /* Delete a position from a set. 
*/ + static void + delete(p, s) + position p; + position_set *s; + { + int i; + + for (i = 0; i < s->nelem; ++i) + if (p.index == s->elems[i].index) + break; + if (i < s->nelem) + for (--s->nelem; i < s->nelem; ++i) + s->elems[i] = s->elems[i + 1]; + } + + /* Find the index of the state corresponding to the given position set with + the given preceding context, or create a new state if there is no such + state. Newline and letter tell whether we got here on a newline or + letter, respectively. */ + static int + state_index(d, s, newline, letter) + struct dfa *d; + position_set *s; + int newline; + int letter; + { + int hash = 0; + int constraint; + int i, j; + + newline = newline ? 1 : 0; + letter = letter ? 1 : 0; + + for (i = 0; i < s->nelem; ++i) + hash ^= s->elems[i].index + s->elems[i].constraint; + + /* Try to find a state that exactly matches the proposed one. */ + for (i = 0; i < d->sindex; ++i) + { + if (hash != d->states[i].hash || s->nelem != d->states[i].elems.nelem + || newline != d->states[i].newline || letter != d->states[i].letter) + continue; + for (j = 0; j < s->nelem; ++j) + if (s->elems[j].constraint + != d->states[i].elems.elems[j].constraint + || s->elems[j].index != d->states[i].elems.elems[j].index) + break; + if (j == s->nelem) + return i; + } + + /* We'll have to create a new state. 
*/ + REALLOC_IF_NECESSARY(d->states, dfa_state, d->salloc, d->sindex); + d->states[i].hash = hash; + MALLOC(d->states[i].elems.elems, position, s->nelem); + copy(s, &d->states[i].elems); + d->states[i].newline = newline; + d->states[i].letter = letter; + d->states[i].backref = 0; + d->states[i].constraint = 0; + d->states[i].first_end = 0; + for (j = 0; j < s->nelem; ++j) + if (d->tokens[s->elems[j].index] < 0) + { + constraint = s->elems[j].constraint; + if (SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 0) + || SUCCEEDS_IN_CONTEXT(constraint, newline, 0, letter, 1) + || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 0) + || SUCCEEDS_IN_CONTEXT(constraint, newline, 1, letter, 1)) + d->states[i].constraint |= constraint; + if (! d->states[i].first_end) + d->states[i].first_end = d->tokens[s->elems[j].index]; + } + else if (d->tokens[s->elems[j].index] == BACKREF) + { + d->states[i].constraint = NO_CONSTRAINT; + d->states[i].backref = 1; + } + + ++d->sindex; + + return i; + } + + /* Find the epsilon closure of a set of positions. If any position of the set + contains a symbol that matches the empty string in some context, replace + that position with the elements of its follow labeled with an appropriate + constraint. Repeat exhaustively until no funny positions are left. + S->elems must be large enough to hold the result. 
*/
+ static void epsclosure _RE_ARGS((position_set *s, struct dfa *d));
+ /* S is rewritten in place.  D supplies the token array and the follow
+    sets.  visited[] has one flag per token so that each zero-width
+    position is expanded at most once.  */
+ static void
+ epsclosure(s, d)
+      position_set *s;
+      struct dfa *d;
+ {
+   int i, j;
+   int *visited;
+   position p, old;
+
+   MALLOC(visited, int, d->tindex);
+   for (i = 0; i < d->tindex; ++i)
+     visited[i] = 0;
+
+   for (i = 0; i < s->nelem; ++i)
+     if (d->tokens[s->elems[i].index] >= NOTCHAR       /* not a plain byte */
+         && d->tokens[s->elems[i].index] != BACKREF
+         && d->tokens[s->elems[i].index] < CSET)       /* not a char class */
+       {
+         old = s->elems[i];
+         p.constraint = old.constraint;
+         delete(s->elems[i], s);
+         if (visited[old.index])
+           {
+             --i;                /* the next element slid into slot i */
+             continue;
+           }
+         visited[old.index] = 1;
+         switch (d->tokens[old.index])
+           {
+           case BEGLINE:
+             p.constraint &= BEGLINE_CONSTRAINT;
+             break;
+           case ENDLINE:
+             p.constraint &= ENDLINE_CONSTRAINT;
+             break;
+           case BEGWORD:
+             p.constraint &= BEGWORD_CONSTRAINT;
+             break;
+           case ENDWORD:
+             p.constraint &= ENDWORD_CONSTRAINT;
+             break;
+           case LIMWORD:
+             p.constraint &= LIMWORD_CONSTRAINT;
+             break;
+           case NOTLIMWORD:
+             p.constraint &= NOTLIMWORD_CONSTRAINT;
+             break;
+           default:
+             break;
+           }
+         for (j = 0; j < d->follows[old.index].nelem; ++j)
+           {
+             p.index = d->follows[old.index].elems[j].index;
+             insert(p, s);       /* carries the narrowed constraint */
+           }
+         /* Force rescan to start at the beginning. */
+         i = -1;
+       }
+
+   free(visited);
+ }
+
+ /* Perform bottom-up analysis on the parse tree, computing various functions.
+    Note that at this point, we're pretending constructs like \< are real
+    characters rather than constraints on what can follow them.
+
+    Nullable: A node is nullable if it is at the root of a regexp that can
+    match the empty string.
+    * EMPTY leaves are nullable.
+    * No other leaf is nullable.
+    * A QMARK or STAR node is nullable.
+    * A PLUS node is nullable if its argument is nullable.
+    * A CAT node is nullable if both its arguments are nullable.
+    * An OR node is nullable if either argument is nullable.
+ + Firstpos: The firstpos of a node is the set of positions (nonempty leaves) + that could correspond to the first character of a string matching the + regexp rooted at the given node. + * EMPTY leaves have empty firstpos. + * The firstpos of a nonempty leaf is that leaf itself. + * The firstpos of a QMARK, STAR, or PLUS node is the firstpos of its + argument. + * The firstpos of a CAT node is the firstpos of the left argument, union + the firstpos of the right if the left argument is nullable. + * The firstpos of an OR node is the union of firstpos of each argument. + + Lastpos: The lastpos of a node is the set of positions that could + correspond to the last character of a string matching the regexp at + the given node. + * EMPTY leaves have empty lastpos. + * The lastpos of a nonempty leaf is that leaf itself. + * The lastpos of a QMARK, STAR, or PLUS node is the lastpos of its + argument. + * The lastpos of a CAT node is the lastpos of its right argument, union + the lastpos of the left if the right argument is nullable. + * The lastpos of an OR node is the union of the lastpos of each argument. + + Follow: The follow of a position is the set of positions that could + correspond to the character following a character matching the node in + a string matching the regexp. At this point we consider special symbols + that match the empty string in some context to be just normal characters. + Later, if we find that a special symbol is in a follow set, we will + replace it with the elements of its follow, labeled with an appropriate + constraint. + * Every node in the firstpos of the argument of a STAR or PLUS node is in + the follow of every node in the lastpos. + * Every node in the firstpos of the second argument of a CAT node is in + the follow of every node in the lastpos of the first argument. + + Because of the postfix representation of the parse tree, the depth-first + analysis is conveniently done by a linear scan with the aid of a stack. 
+    Sets are stored as arrays of the elements, obeying a stack-like allocation
+    scheme; the number of elements in each set deeper in the stack can be
+    used to determine the address of a particular set's array. */
+ void
+ dfaanalyze(d, searchflag)
+      struct dfa *d;
+      int searchflag;
+ {
+   int *nullable;              /* Nullable stack. */
+   int *nfirstpos;             /* Element count stack for firstpos sets. */
+   position *firstpos;         /* Array where firstpos elements are stored. */
+   int *nlastpos;              /* Element count stack for lastpos sets. */
+   position *lastpos;          /* Array where lastpos elements are stored. */
+   int *nalloc;                /* Sizes of arrays allocated to follow sets. */
+   position_set tmp;           /* Temporary set for merging sets. */
+   position_set merged;        /* Result of merging sets. */
+   int wants_newline;          /* True if some position wants newline info. */
+   /* The o_* pointers remember the start of each allocation, because the
+      working pointers above are moved during the scan; they are what is
+      passed to free() at the end. */
+   int *o_nullable;
+   int *o_nfirst, *o_nlast;
+   position *o_firstpos, *o_lastpos;
+   int i, j;
+   position *pos;
+
+ #ifdef DEBUG
+   fprintf(stderr, "dfaanalyze:\n");
+   for (i = 0; i < d->tindex; ++i)
+     {
+       fprintf(stderr, " %d:", i);
+       prtok(d->tokens[i]);
+     }
+   putc('\n', stderr);
+ #endif
+
+   d->searchflag = searchflag;
+
+   MALLOC(nullable, int, d->depth);
+   o_nullable = nullable;
+   MALLOC(nfirstpos, int, d->depth);
+   o_nfirst = nfirstpos;
+   /* firstpos and lastpos start one past the end of their arrays and are
+      decremented as leaves are pushed (see the default case below). */
+   MALLOC(firstpos, position, d->nleaves);
+   o_firstpos = firstpos, firstpos += d->nleaves;
+   MALLOC(nlastpos, int, d->depth);
+   o_nlast = nlastpos;
+   MALLOC(lastpos, position, d->nleaves);
+   o_lastpos = lastpos, lastpos += d->nleaves;
+   MALLOC(nalloc, int, d->tindex);
+   for (i = 0; i < d->tindex; ++i)
+     nalloc[i] = 0;
+   MALLOC(merged.elems, position, d->nleaves);
+
+   CALLOC(d->follows, position_set, d->tindex);
+
+   /* One pass over the postfix token array; each case pops its operands'
+      entries from the nullable/nfirstpos/nlastpos stacks and pushes the
+      entry for the node itself. */
+   for (i = 0; i < d->tindex; ++i)
+ #ifdef DEBUG
+     {                         /* Nonsyntactic #ifdef goo... */
+ #endif
+     switch (d->tokens[i])
+       {
+       case EMPTY:
+         /* The empty set is nullable. */
+         *nullable++ = 1;
+
+         /* The firstpos and lastpos of the empty leaf are both empty. */
+         *nfirstpos++ = *nlastpos++ = 0;
+         break;
+
+       case STAR:
+       case PLUS:
+         /* Every element in the firstpos of the argument is in the follow
+            of every element in the lastpos. */
+         tmp.nelem = nfirstpos[-1];
+         tmp.elems = firstpos;
+         pos = lastpos;
+         for (j = 0; j < nlastpos[-1]; ++j)
+           {
+             merge(&tmp, &d->follows[pos[j].index], &merged);
+             REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
+                                  nalloc[pos[j].index], merged.nelem - 1);
+             copy(&merged, &d->follows[pos[j].index]);
+           }
+
+         /* STAR and PLUS deliberately fall through into QMARK. */
+       case QMARK:
+         /* A QMARK or STAR node is automatically nullable. */
+         if (d->tokens[i] != PLUS)
+           nullable[-1] = 1;
+         break;
+
+       case CAT:
+         /* Every element in the firstpos of the second argument is in the
+            follow of every element in the lastpos of the first argument. */
+         tmp.nelem = nfirstpos[-1];
+         tmp.elems = firstpos;
+         pos = lastpos + nlastpos[-1];
+         for (j = 0; j < nlastpos[-2]; ++j)
+           {
+             merge(&tmp, &d->follows[pos[j].index], &merged);
+             REALLOC_IF_NECESSARY(d->follows[pos[j].index].elems, position,
+                                  nalloc[pos[j].index], merged.nelem - 1);
+             copy(&merged, &d->follows[pos[j].index]);
+           }
+
+         /* The firstpos of a CAT node is the firstpos of the first argument,
+            union that of the second argument if the first is nullable. */
+         if (nullable[-2])
+           nfirstpos[-2] += nfirstpos[-1];
+         else
+           firstpos += nfirstpos[-1];
+         --nfirstpos;
+
+         /* The lastpos of a CAT node is the lastpos of the second argument,
+            union that of the first argument if the second is nullable. */
+         if (nullable[-1])
+           nlastpos[-2] += nlastpos[-1];
+         else
+           {
+             /* Discard the first argument's lastpos by moving the second
+                argument's elements over it. */
+             pos = lastpos + nlastpos[-2];
+             for (j = nlastpos[-1] - 1; j >= 0; --j)
+               pos[j] = lastpos[j];
+             lastpos += nlastpos[-2];
+             nlastpos[-2] = nlastpos[-1];
+           }
+         --nlastpos;
+
+         /* A CAT node is nullable if both arguments are nullable. */
+         nullable[-2] = nullable[-1] && nullable[-2];
+         --nullable;
+         break;
+
+       case OR:
+       case ORTOP:
+         /* The firstpos is the union of the firstpos of each argument. */
+         nfirstpos[-2] += nfirstpos[-1];
+         --nfirstpos;
+
+         /* The lastpos is the union of the lastpos of each argument. */
+         nlastpos[-2] += nlastpos[-1];
+         --nlastpos;
+
+         /* An OR node is nullable if either argument is nullable. */
+         nullable[-2] = nullable[-1] || nullable[-2];
+         --nullable;
+         break;
+
+       default:
+         /* Anything else is a nonempty position.  (Note that special
+            constructs like \< are treated as nonempty strings here;
+            an "epsilon closure" effectively makes them nullable later.)
+            Backreferences have to get a real position so we can detect
+            transitions on them later.  But they are nullable. */
+         *nullable++ = d->tokens[i] == BACKREF;
+
+         /* This position is in its own firstpos and lastpos. */
+         *nfirstpos++ = *nlastpos++ = 1;
+         --firstpos, --lastpos;
+         firstpos->index = lastpos->index = i;
+         firstpos->constraint = lastpos->constraint = NO_CONSTRAINT;
+
+         /* Allocate the follow set for this position. */
+         nalloc[i] = 1;
+         MALLOC(d->follows[i].elems, position, nalloc[i]);
+         break;
+       }
+ #ifdef DEBUG
+       /* ... balance the above nonsyntactic #ifdef goo... */
+       fprintf(stderr, "node %d:", i);
+       prtok(d->tokens[i]);
+       putc('\n', stderr);
+       fprintf(stderr, nullable[-1] ? " nullable: yes\n" : " nullable: no\n");
+       fprintf(stderr, " firstpos:");
+       for (j = nfirstpos[-1] - 1; j >= 0; --j)
+         {
+           fprintf(stderr, " %d:", firstpos[j].index);
+           prtok(d->tokens[firstpos[j].index]);
+         }
+       fprintf(stderr, "\n lastpos:");
+       for (j = nlastpos[-1] - 1; j >= 0; --j)
+         {
+           fprintf(stderr, " %d:", lastpos[j].index);
+           prtok(d->tokens[lastpos[j].index]);
+         }
+       putc('\n', stderr);
+     }
+ #endif
+
+   /* For each follow set that is the follow set of a real position, replace
+      it with its epsilon closure. */
+   for (i = 0; i < d->tindex; ++i)
+     if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF
+         || d->tokens[i] >= CSET)
+       {
+ #ifdef DEBUG
+         fprintf(stderr, "follows(%d:", i);
+         prtok(d->tokens[i]);
+         fprintf(stderr, "):");
+         for (j = d->follows[i].nelem - 1; j >= 0; --j)
+           {
+             fprintf(stderr, " %d:", d->follows[i].elems[j].index);
+             prtok(d->tokens[d->follows[i].elems[j].index]);
+           }
+         putc('\n', stderr);
+ #endif
+         /* The closure is computed in the scratch set and copied back,
+            growing the follow set's array first when it got larger. */
+         copy(&d->follows[i], &merged);
+         epsclosure(&merged, d);
+         if (d->follows[i].nelem < merged.nelem)
+           REALLOC(d->follows[i].elems, position, merged.nelem);
+         copy(&merged, &d->follows[i]);
+       }
+
+   /* Get the epsilon closure of the firstpos of the regexp.  The result will
+      be the set of positions of state 0. */
+   merged.nelem = 0;
+   for (i = 0; i < nfirstpos[-1]; ++i)
+     insert(firstpos[i], &merged);
+   epsclosure(&merged, d);
+
+   /* Check if any of the positions of state 0 will want newline context. */
+   wants_newline = 0;
+   for (i = 0; i < merged.nelem; ++i)
+     if (PREV_NEWLINE_DEPENDENT(merged.elems[i].constraint))
+       wants_newline = 1;
+
+   /* Build the initial state. */
+   d->salloc = 1;
+   d->sindex = 0;
+   MALLOC(d->states, dfa_state, d->salloc);
+   state_index(d, &merged, wants_newline, 0);
+
+   free(o_nullable);
+   free(o_nfirst);
+   free(o_firstpos);
+   free(o_nlast);
+   free(o_lastpos);
+   free(nalloc);
+   free(merged.elems);
+ }
+
+ /* Find, for each character, the transition out of state s of d, and store
+    it in the appropriate slot of trans.
+
+    We divide the positions of s into groups (positions can appear in more
+    than one group).  Each group is labeled with a set of characters that
+    every position in the group matches (taking into account, if necessary,
+    preceding context information of s).  For each group, find the union
+    of its elements' follows.  This set is the set of positions of the
+    new state.
For each character in the group's label, set the transition
+    on this character to be to a state corresponding to the set's positions,
+    and its associated backward context information, if necessary.
+
+    If we are building a searching matcher, we include the positions of state
+    0 in every state.
+
+    The collection of groups is constructed by building an equivalence-class
+    partition of the positions of s.
+
+    For each position, find the set of characters C that it matches.  Eliminate
+    any characters from C that fail on grounds of backward context.
+
+    Search through the groups, looking for a group whose label L has nonempty
+    intersection with C.  If L - C is nonempty, create a new group labeled
+    L - C and having the same positions as the current group, and set L to
+    the intersection of L and C.  Insert the position in this group, set
+    C = C - L, and resume scanning.
+
+    If after comparing with every group there are characters remaining in C,
+    create a new group labeled with the characters of C and insert this
+    position in that group. */
+ void
+ dfastate(s, d, trans)
+      int s;
+      struct dfa *d;
+      int trans[];
+ {
+   position_set grps[NOTCHAR]; /* As many as will ever be needed. */
+   charclass labels[NOTCHAR];  /* Labels corresponding to the groups. */
+   int ngrps = 0;              /* Number of groups actually used. */
+   position pos;               /* Current position being considered. */
+   charclass matches;          /* Set of matching characters. */
+   int matchesf;               /* True if matches is nonempty. */
+   charclass intersect;        /* Intersection with some label set. */
+   int intersectf;             /* True if intersect is nonempty. */
+   charclass leftovers;        /* Stuff in the label that didn't match. */
+   int leftoversf;             /* True if leftovers is nonempty. */
+   static charclass letters;   /* Set of characters considered letters. */
+   static charclass newline;   /* Set of characters that aren't newline. */
+   /* NOTE(review): the comment on `newline' does not match the code below,
+      which only ever does setbit('\n', newline); confirm which of the two
+      is intended before relying on either. */
+   position_set follows;       /* Union of the follows of some group. */
+   position_set tmp;           /* Temporary space for merging sets. */
+   /* NOTE(review): tmp.elems is allocated and freed in this function but
+      not otherwise referenced here. */
+   int state;                  /* New state. */
+   int wants_newline;          /* New state wants to know newline context. */
+   int state_newline;          /* New state on a newline transition. */
+   int wants_letter;           /* New state wants to know letter context. */
+   int state_letter;           /* New state on a letter transition. */
+   static initialized;         /* Flag for static initialization. */
+   int i, j, k;
+
+   /* Initialize the set of letters, if necessary. */
+   if (! initialized)
+     {
+       initialized = 1;
+       for (i = 0; i < NOTCHAR; ++i)
+         if (ISALNUM(i))
+           setbit(i, letters);
+       setbit('\n', newline);
+     }
+
+   zeroset(matches);
+
+   /* First pass: partition the positions of state s into groups. */
+   for (i = 0; i < d->states[s].elems.nelem; ++i)
+     {
+       pos = d->states[s].elems.elems[i];
+       /* A token below NOTCHAR is a single character, one at or above CSET
+          indexes d->charclasses; positions of any other token are skipped. */
+       if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR)
+         setbit(d->tokens[pos.index], matches);
+       else if (d->tokens[pos.index] >= CSET)
+         copyset(d->charclasses[d->tokens[pos.index] - CSET], matches);
+       else
+         continue;
+
+       /* Some characters may need to be eliminated from matches because
+          they fail in the current context. */
+       if (pos.constraint != 0xFF)
+         {
+           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
+                                          d->states[s].newline, 1))
+             clrbit('\n', matches);
+           if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
+                                          d->states[s].newline, 0))
+             for (j = 0; j < CHARCLASS_INTS; ++j)
+               matches[j] &= newline[j];
+           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
+                                         d->states[s].letter, 1))
+             for (j = 0; j < CHARCLASS_INTS; ++j)
+               matches[j] &= ~letters[j];
+           if (! MATCHES_LETTER_CONTEXT(pos.constraint,
+                                         d->states[s].letter, 0))
+             for (j = 0; j < CHARCLASS_INTS; ++j)
+               matches[j] &= letters[j];
+
+           /* If there are no characters left, there's no point in going on. */
+           for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
+             ;
+           if (j == CHARCLASS_INTS)
+             continue;
+         }
+
+       for (j = 0; j < ngrps; ++j)
+         {
+           /* If matches contains a single character only, and the current
+              group's label doesn't contain that character, go on to the
+              next group. */
+           if (d->tokens[pos.index] >= 0 && d->tokens[pos.index] < NOTCHAR
+               && !tstbit(d->tokens[pos.index], labels[j]))
+             continue;
+
+           /* Check if this group's label has a nonempty intersection with
+              matches. */
+           intersectf = 0;
+           for (k = 0; k < CHARCLASS_INTS; ++k)
+             (intersect[k] = matches[k] & labels[j][k]) ? intersectf = 1 : 0;
+           if (! intersectf)
+             continue;
+
+           /* It does; now find the set differences both ways. */
+           leftoversf = matchesf = 0;
+           for (k = 0; k < CHARCLASS_INTS; ++k)
+             {
+               /* Even an optimizing compiler can't know this for sure. */
+               int match = matches[k], label = labels[j][k];
+
+               (leftovers[k] = ~match & label) ? leftoversf = 1 : 0;
+               (matches[k] = match & ~label) ? matchesf = 1 : 0;
+             }
+
+           /* If there were leftovers, create a new group labeled with them. */
+           if (leftoversf)
+             {
+               copyset(leftovers, labels[ngrps]);
+               copyset(intersect, labels[j]);
+               MALLOC(grps[ngrps].elems, position, d->nleaves);
+               copy(&grps[j], &grps[ngrps]);
+               ++ngrps;
+             }
+
+           /* Put the position in the current group.  Note that there is no
+              reason to call insert() here. */
+           grps[j].elems[grps[j].nelem++] = pos;
+
+           /* If every character matching the current position has been
+              accounted for, we're done. */
+           if (! matchesf)
+             break;
+         }
+
+       /* If we've passed the last group, and there are still characters
+          unaccounted for, then we'll have to create a new group. */
+       if (j == ngrps)
+         {
+           copyset(matches, labels[ngrps]);
+           zeroset(matches);
+           MALLOC(grps[ngrps].elems, position, d->nleaves);
+           grps[ngrps].nelem = 1;
+           grps[ngrps].elems[0] = pos;
+           ++ngrps;
+         }
+     }
+
+   MALLOC(follows.elems, position, d->nleaves);
+   MALLOC(tmp.elems, position, d->nleaves);
+
+   /* If we are a searching matcher, the default transition is to a state
+      containing the positions of state 0, otherwise the default transition
+      is to fail miserably. */
+   if (d->searchflag)
+     {
+       wants_newline = 0;
+       wants_letter = 0;
+       for (i = 0; i < d->states[0].elems.nelem; ++i)
+         {
+           if (PREV_NEWLINE_DEPENDENT(d->states[0].elems.elems[i].constraint))
+             wants_newline = 1;
+           if (PREV_LETTER_DEPENDENT(d->states[0].elems.elems[i].constraint))
+             wants_letter = 1;
+         }
+       copy(&d->states[0].elems, &follows);
+       state = state_index(d, &follows, 0, 0);
+       if (wants_newline)
+         state_newline = state_index(d, &follows, 1, 0);
+       else
+         state_newline = state;
+       if (wants_letter)
+         state_letter = state_index(d, &follows, 0, 1);
+       else
+         state_letter = state;
+       for (i = 0; i < NOTCHAR; ++i)
+         if (i == '\n')
+           trans[i] = state_newline;
+         else if (ISALNUM(i))
+           trans[i] = state_letter;
+         else
+           trans[i] = state;
+     }
+   else
+     for (i = 0; i < NOTCHAR; ++i)
+       trans[i] = -1;
+
+   /* Second pass: one target state (or up to three, depending on context)
+      per group, written over the defaults set above. */
+   for (i = 0; i < ngrps; ++i)
+     {
+       follows.nelem = 0;
+
+       /* Find the union of the follows of the positions of the group.
+          This is a hideously inefficient loop.  Fix it someday. */
+       for (j = 0; j < grps[i].nelem; ++j)
+         for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k)
+           insert(d->follows[grps[i].elems[j].index].elems[k], &follows);
+
+       /* If we are building a searching matcher, throw in the positions
+          of state 0 as well. */
+       if (d->searchflag)
+         for (j = 0; j < d->states[0].elems.nelem; ++j)
+           insert(d->states[0].elems.elems[j], &follows);
+
+       /* Find out if the new state will want any context information. */
+       wants_newline = 0;
+       if (tstbit('\n', labels[i]))
+         for (j = 0; j < follows.nelem; ++j)
+           if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
+             wants_newline = 1;
+
+       wants_letter = 0;
+       for (j = 0; j < CHARCLASS_INTS; ++j)
+         if (labels[i][j] & letters[j])
+           break;
+       if (j < CHARCLASS_INTS)
+         for (j = 0; j < follows.nelem; ++j)
+           if (PREV_LETTER_DEPENDENT(follows.elems[j].constraint))
+             wants_letter = 1;
+
+       /* Find the state(s) corresponding to the union of the follows. */
+       state = state_index(d, &follows, 0, 0);
+       if (wants_newline)
+         state_newline = state_index(d, &follows, 1, 0);
+       else
+         state_newline = state;
+       if (wants_letter)
+         state_letter = state_index(d, &follows, 0, 1);
+       else
+         state_letter = state;
+
+       /* Set the transitions for each character in the current label. */
+       for (j = 0; j < CHARCLASS_INTS; ++j)
+         for (k = 0; k < INTBITS; ++k)
+           if (labels[i][j] & 1 << k)
+             {
+               int c = j * INTBITS + k;
+
+               if (c == '\n')
+                 trans[c] = state_newline;
+               else if (ISALNUM(c))
+                 trans[c] = state_letter;
+               else if (c < NOTCHAR)
+                 trans[c] = state;
+             }
+     }
+
+   for (i = 0; i < ngrps; ++i)
+     free(grps[i].elems);
+   free(follows.elems);
+   free(tmp.elems);
+ }
+
+ /* Some routines for manipulating a compiled dfa's transition tables.
+    Each state may or may not have a transition table; if it does, and it
+    is a non-accepting state, then d->trans[state] points to its table.
+    If it is an accepting state then d->fails[state] points to its table.
+    If it has no table at all, then d->trans[state] is NULL.
+    TODO: Improve this comment, get rid of the unnecessary redundancy. */
+
+ /* Build the transition table for state S of D and store it in
+    d->fails[s] or d->trans[s], growing the per-state arrays if the new
+    table mentions a state number they cannot hold yet. */
+ static void
+ build_state(s, d)
+      int s;
+      struct dfa *d;
+ {
+   int *trans;                 /* The new transition table. */
+   int i;
+
+   /* Set an upper limit on the number of transition tables that will ever
+      exist at once.  1024 is arbitrary.  The idea is that the frequently
+      used transition tables will be quickly rebuilt, whereas the ones that
+      were only needed once or twice will be cleared away. */
+   if (d->trcount >= 1024)
+     {
+       for (i = 0; i < d->tralloc; ++i)
+         if (d->trans[i])
+           {
+             free((ptr_t) d->trans[i]);
+             d->trans[i] = NULL;
+           }
+         else if (d->fails[i])
+           {
+             free((ptr_t) d->fails[i]);
+             d->fails[i] = NULL;
+           }
+       d->trcount = 0;
+     }
+
+   ++d->trcount;
+
+   /* Set up the success bits for this state. */
+   /* Bits 4, 2 and 1 are set for the (newline, letter) argument pairs
+      (1,0), (0,1) and (0,0) respectively; dfaexec tests them through its
+      sbit[] table, which holds 4 for '\n', 2 for ISALNUM characters and
+      1 for everything else. */
+   d->success[s] = 0;
+   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 1, d->states[s].letter, 0,
+       s, *d))
+     d->success[s] |= 4;
+   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 1,
+       s, *d))
+     d->success[s] |= 2;
+   if (ACCEPTS_IN_CONTEXT(d->states[s].newline, 0, d->states[s].letter, 0,
+       s, *d))
+     d->success[s] |= 1;
+
+   MALLOC(trans, int, NOTCHAR);
+   dfastate(s, d, trans);
+
+   /* Now go through the new transition table, and make sure that the trans
+      and fail arrays are allocated large enough to hold a pointer for the
+      largest state mentioned in the table. */
+   for (i = 0; i < NOTCHAR; ++i)
+     if (trans[i] >= d->tralloc)
+       {
+         int oldalloc = d->tralloc;
+
+         while (trans[i] >= d->tralloc)
+           d->tralloc *= 2;
+         /* realtrans carries one extra leading slot, so d->trans[-1] is a
+            valid element of the array. */
+         REALLOC(d->realtrans, int *, d->tralloc + 1);
+         d->trans = d->realtrans + 1;
+         REALLOC(d->fails, int *, d->tralloc);
+         REALLOC(d->success, int, d->tralloc);
+         REALLOC(d->newlines, int, d->tralloc);
+         while (oldalloc < d->tralloc)
+           {
+             d->trans[oldalloc] = NULL;
+             d->fails[oldalloc++] = NULL;
+           }
+       }
+
+   /* Keep the newline transition in a special place so we can use it as
+      a sentinel. */
+   d->newlines[s] = trans['\n'];
+   trans['\n'] = -1;
+
+   if (ACCEPTING(s, *d))
+     d->fails[s] = trans;
+   else
+     d->trans[s] = trans;
+ }
+
+ /* Allocate the per-state arrays with room for a single state and build
+    the transition table of state 0.  dfaexec calls this the first time it
+    sees d->tralloc == 0. */
+ static void
+ build_state_zero(d)
+      struct dfa *d;
+ {
+   d->tralloc = 1;
+   d->trcount = 0;
+   CALLOC(d->realtrans, int *, d->tralloc + 1);
+   d->trans = d->realtrans + 1;
+   CALLOC(d->fails, int *, d->tralloc);
+   MALLOC(d->success, int, d->tralloc);
+   MALLOC(d->newlines, int, d->tralloc);
+   build_state(0, d);
+ }
+
+ /* Search through a buffer looking for a match to the given struct dfa.
+    Find the first occurrence of a string matching the regexp in the buffer,
+    and the shortest possible version thereof.  Return a pointer to the first
+    character after the match, or NULL if none is found.  Begin points to
+    the beginning of the buffer, and end points to the first character after
+    its end.
We store a newline in *end to act as a sentinel, so end had + better point somewhere valid. Newline is a flag indicating whether to + allow newlines to be in the matching string. If count is non- + NULL it points to a place we're supposed to increment every time we + see a newline. Finally, if backref is non-NULL it points to a place + where we're supposed to store a 1 if backreferencing happened and the + match needs to be verified by a backtracking matcher. Otherwise + we store a 0 in *backref. */ + char * + dfaexec(d, begin, end, newline, count, backref) + struct dfa *d; + char *begin; + char *end; + int newline; + int *count; + int *backref; + { + register s, s1, tmp; /* Current state. */ + register unsigned char *p; /* Current input character. */ + register **trans, *t; /* Copy of d->trans so it can be optimized + into a register. */ + static sbit[NOTCHAR]; /* Table for anding with d->success. */ + static sbit_init; + + if (! sbit_init) + { + int i; + + sbit_init = 1; + for (i = 0; i < NOTCHAR; ++i) + if (i == '\n') + sbit[i] = 4; + else if (ISALNUM(i)) + sbit[i] = 2; + else + sbit[i] = 1; + } + + if (! d->tralloc) + build_state_zero(d); + + s = s1 = 0; + p = (unsigned char *) begin; + trans = d->trans; + *end = '\n'; + + for (;;) + { + /* The dreaded inner loop. */ + if ((t = trans[s]) != 0) + do + { + s1 = t[*p++]; + if (! (t = trans[s1])) + goto last_was_s; + s = t[*p++]; + } + while ((t = trans[s]) != 0); + goto last_was_s1; + last_was_s: + tmp = s, s = s1, s1 = tmp; + last_was_s1: + + if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) + { + if (d->success[s] & sbit[*p]) + { + if (backref) + if (d->states[s].backref) + *backref = 1; + else + *backref = 0; + return (char *) p; + } + + s1 = s; + s = d->fails[s][*p++]; + continue; + } + + /* If the previous character was a newline, count it. */ + if (count && (char *) p <= end && p[-1] == '\n') + ++*count; + + /* Check if we've run off the end of the buffer. 
*/ + if ((char *) p > end) + return NULL; + + if (s >= 0) + { + build_state(s, d); + trans = d->trans; + continue; + } + + if (p[-1] == '\n' && newline) + { + s = d->newlines[s1]; + continue; + } + + s = 0; + } + } + + /* Initialize the components of a dfa that the other routines don't + initialize for themselves. */ + void + dfainit(d) + struct dfa *d; + { + d->calloc = 1; + MALLOC(d->charclasses, charclass, d->calloc); + d->cindex = 0; + + d->talloc = 1; + MALLOC(d->tokens, token, d->talloc); + d->tindex = d->depth = d->nleaves = d->nregexps = 0; + + d->searchflag = 0; + d->tralloc = 0; + + d->musts = 0; + } + + /* Parse and analyze a single string of the given length. */ + void + dfacomp(s, len, d, searchflag) + char *s; + size_t len; + struct dfa *d; + int searchflag; + { + if (case_fold) /* dummy folding in service of dfamust() */ + { + char *lcopy, *p; + int i; + + p = lcopy = malloc(len + 7); + if (!lcopy) + fprintf(stderr,"out of memory"); + + /* This is a kludge. */ + case_fold = 0; + if (current_mbctype != MBCTYPE_ASCII && searchflag) { + *p++ = '^'; + *p++ = '.'; + *p++ = '*'; + if (!(syntax_bits & RE_NO_BK_PARENS)) + *p++ = '\\'; + *p++ = '('; + } + for (i = 0; i < len; ++i) + if (ISUPPER(s[i])) + *p++ = tolower((unsigned char)s[i]); + else + if (ismbchar(*p++ = s[i]) && ++i < len) + *p++ = s[i]; + if (current_mbctype != MBCTYPE_ASCII && searchflag) { + if (!(syntax_bits & RE_NO_BK_PARENS)) + *p++ = '\\'; + *p++ = ')'; + } + + dfainit(d); + dfaparse(lcopy, p - lcopy, d); + dfamust(d); + d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; + case_fold = 1; + if (current_mbctype != MBCTYPE_ASCII && searchflag) { + bcopy(s, lcopy + (syntax_bits & RE_NO_BK_PARENS ? 
4 : 5), len); + dfaparse(lcopy, p - lcopy, d); + free(lcopy); + } + else { + free(lcopy); + dfaparse(s, len, d); + } + dfaanalyze(d, searchflag); + } + else + { + dfainit(d); + if (current_mbctype != MBCTYPE_ASCII && searchflag) { + char *lcopy, *p; + + p = lcopy = malloc(len + 7); + *p++ = '^'; + *p++ = '.'; + *p++ = '*'; + if (!(syntax_bits & RE_NO_BK_PARENS)) + *p++ = '\\'; + *p++ = '('; + bcopy(s, p, len); + p += len; + if (!(syntax_bits & RE_NO_BK_PARENS)) + *p++ = '\\'; + *p++ = ')'; + dfaparse(lcopy, p - lcopy, d); + free(lcopy); + } + else + dfaparse(s, len, d); + dfamust(d); + dfaanalyze(d, searchflag); + } + } + + /* Free the storage held by the components of a dfa. */ + void + dfafree(d) + struct dfa *d; + { + int i; + struct dfamust *dm, *ndm; + + free((ptr_t) d->charclasses); + free((ptr_t) d->tokens); + for (i = 0; i < d->sindex; ++i) + free((ptr_t) d->states[i].elems.elems); + free((ptr_t) d->states); + for (i = 0; i < d->tindex; ++i) + if (d->follows[i].elems) + free((ptr_t) d->follows[i].elems); + free((ptr_t) d->follows); + for (i = 0; i < d->tralloc; ++i) + if (d->trans[i]) + free((ptr_t) d->trans[i]); + else if (d->fails[i]) + free((ptr_t) d->fails[i]); + free((ptr_t) d->realtrans); + free((ptr_t) d->fails); + free((ptr_t) d->newlines); + for (dm = d->musts; dm; dm = ndm) + { + ndm = dm->next; + free(dm->must); + free((ptr_t) dm); + } + } + + /* Having found the postfix representation of the regular expression, + try to find a long sequence of characters that must appear in any line + containing the r.e. + Finding a "longest" sequence is beyond the scope here; + we take an easy way out and hope for the best. + (Take "(ab|a)b"--please.) 
+ + We do a bottom-up calculation of sequences of characters that must appear + in matches of r.e.'s represented by trees rooted at the nodes of the postfix + representation: + sequences that must appear at the left of the match ("left") + sequences that must appear at the right of the match ("right") + lists of sequences that must appear somewhere in the match ("in") + sequences that must constitute the match ("is") + + When we get to the root of the tree, we use one of the longest of its + calculated "in" sequences as our answer. The sequence we find is returned in + d->must (where "d" is the single argument passed to "dfamust"); + the length of the sequence is returned in d->mustn. + + The sequences calculated for the various types of node (in pseudo ANSI c) + are shown below. "p" is the operand of unary operators (and the left-hand + operand of binary operators); "q" is the right-hand operand of binary + operators. + + "ZERO" means "a zero-length sequence" below. + + Type left right is in + ---- ---- ----- -- -- + char c # c # c # c # c + + CSET ZERO ZERO ZERO ZERO + + STAR ZERO ZERO ZERO ZERO + + QMARK ZERO ZERO ZERO ZERO + + PLUS p->left p->right ZERO p->in + + CAT (p->is==ZERO)? (q->is==ZERO)? (p->is!=ZERO && p->in plus + p->left : q->right : q->is!=ZERO) ? q->in plus + p->is##q->left p->right##q->is p->is##q->is : p->right##q->left + ZERO + + OR longest common longest common (do p->is and substrings common to + leading trailing q->is have same p->in and q->in + (sub)sequence (sub)sequence length and + of p->left of p->right content) ? + and q->left and q->right p->is : NULL + + If there's anything else we recognize in the tree, all four sequences get set + to zero-length sequences. If there's something we don't recognize in the tree, + we just return a zero-length sequence. + + Break ties in favor of infrequent letters (choosing 'zzz' in preference to + 'aaa')? + + And. . 
.is it here or someplace that we might ponder "optimizations" such as + egrep 'psi|epsilon' -> egrep 'psi' + egrep 'pepsi|epsilon' -> egrep 'epsi' + (Yes, we now find "epsi" as a "string + that must occur", but we might also + simplify the *entire* r.e. being sought) + grep '[c]' -> grep 'c' + grep '(ab|a)b' -> grep 'ab' + grep 'ab*' -> grep 'a' + grep 'a*b' -> grep 'b' + + There are several issues: + + Is optimization easy (enough)? + + Does optimization actually accomplish anything, + or is the automaton you get from "psi|epsilon" (for example) + the same as the one you get from "psi" (for example)? + + Are optimizable r.e.'s likely to be used in real-life situations + (something like 'ab*' is probably unlikely; something like is + 'psi|epsilon' is likelier)? */ + + static char * + icatalloc(old, new) + char *old; + char *new; + { + char *result; + size_t oldsize, newsize; + + newsize = (new == NULL) ? 0 : strlen(new); + if (old == NULL) + oldsize = 0; + else if (newsize == 0) + return old; + else oldsize = strlen(old); + if (old == NULL) + result = (char *) malloc(newsize + 1); + else + result = (char *) realloc((void *) old, oldsize + newsize + 1); + if (result != NULL && new != NULL) + (void) strcpy(result + oldsize, new); + return result; + } + + static char * + icpyalloc(string) + char *string; + { + return icatalloc((char *) NULL, string); + } + + static char * + istrstr(lookin, lookfor) + char *lookin; + char *lookfor; + { + char *cp; + size_t len; + + len = strlen(lookfor); + for (cp = lookin; *cp != '\0'; ++cp) + if (strncmp(cp, lookfor, len) == 0) + return cp; + return NULL; + } + + static void + ifree(cp) + char *cp; + { + if (cp != NULL) + free(cp); + } + + static void + freelist(cpp) + char **cpp; + { + int i; + + if (cpp == NULL) + return; + for (i = 0; cpp[i] != NULL; ++i) + { + free(cpp[i]); + cpp[i] = NULL; + } + } + + static char ** + enlist(cpp, new, len) + char **cpp; + char *new; + size_t len; + { + int i, j; + + if (cpp == NULL) + return 
NULL; + if ((new = icpyalloc(new)) == NULL) + { + freelist(cpp); + return NULL; + } + new[len] = '\0'; + /* Is there already something in the list that's new (or longer)? */ + for (i = 0; cpp[i] != NULL; ++i) + if (istrstr(cpp[i], new) != NULL) + { + free(new); + return cpp; + } + /* Eliminate any obsoleted strings. */ + j = 0; + while (cpp[j] != NULL) + if (istrstr(new, cpp[j]) == NULL) + ++j; + else + { + free(cpp[j]); + if (--i == j) + break; + cpp[j] = cpp[i]; + cpp[i] = NULL; + } + /* Add the new string. */ + cpp = (char **) realloc((char *) cpp, (i + 2) * sizeof *cpp); + if (cpp == NULL) + return NULL; + cpp[i] = new; + cpp[i + 1] = NULL; + return cpp; + } + + /* Given pointers to two strings, return a pointer to an allocated + list of their distinct common substrings. Return NULL if something + seems wild. */ + static char ** + comsubs(left, right) + char *left; + char *right; + { + char **cpp; + char *lcp; + char *rcp; + size_t i, len; + + if (left == NULL || right == NULL) + return NULL; + cpp = (char **) malloc(sizeof *cpp); + if (cpp == NULL) + return NULL; + cpp[0] = NULL; + for (lcp = left; *lcp != '\0'; ++lcp) + { + len = 0; + rcp = index(right, *lcp); + while (rcp != NULL) + { + for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) + ; + if (i > len) + len = i; + rcp = index(rcp + 1, *lcp); + } + if (len == 0) + continue; + if ((cpp = enlist(cpp, lcp, len)) == NULL) + break; + } + return cpp; + } + + static char ** + addlists(old, new) + char **old; + char **new; + { + int i; + + if (old == NULL || new == NULL) + return NULL; + for (i = 0; new[i] != NULL; ++i) + { + old = enlist(old, new[i], strlen(new[i])); + if (old == NULL) + break; + } + return old; + } + + /* Given two lists of substrings, return a new list giving substrings + common to both. 
*/ + static char ** /* Returns a new list built with addlists() from comsubs() of every left/right pair, or NULL if either argument is NULL or if malloc(), comsubs() or addlists() fails. */ + inboth(left, right) + char **left; + char **right; + { + char **both; + char **temp; + int lnum, rnum; + + if (left == NULL || right == NULL) + return NULL; + both = (char **) malloc(sizeof *both); + if (both == NULL) + return NULL; + both[0] = NULL; + for (lnum = 0; left[lnum] != NULL; ++lnum) + { + for (rnum = 0; right[rnum] != NULL; ++rnum) + { + temp = comsubs(left[lnum], right[rnum]); + if (temp == NULL) + { + freelist(both); + return NULL; + } + both = addlists(both, temp); + freelist(temp); /* NOTE(review): dfamust() follows freelist() with a separate free of the array itself; the temp array is not freed here -- possible leak, confirm against comsubs(). */ + if (both == NULL) + return NULL; + } + } + return both; + } + + typedef struct /* One entry of the stack used by dfamust(). in is a NULL-terminated list of strings; left, right and is are NUL-terminated strings. NOTE(review): judging by the CAT and END cases, is looks like the exact string matched, left/right a required prefix/suffix and in the required substrings -- confirm. */ + { + char **in; + char *left; + char *right; + char *is; + } must; + + static void /* Truncates left, right and is to empty strings and passes the in list to freelist(). */ + resetmust(mp) + must *mp; + { + mp->left[0] = mp->right[0] = mp->is[0] = '\0'; + freelist(mp->in); + } + + static void /* Walks the postfix token array of DFA with a stack of must records (tindex + 1 slots). When END is reached with exactly one record on the stack, the longest string in that record's in list is chosen; if it is non-empty a copy is pushed on dfa->musts, with exact set when it equals the record's is string. Any inconsistency or allocation failure ends the walk without recording anything. */ + dfamust(dfa) + struct dfa *dfa; + { + must *musts; + must *mp; + char *result; + int ri; + int i; + int exact; + token t; + static must must0; + struct dfamust *dm; + static char empty_string[] = ""; + + result = empty_string; + exact = 0; + musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); + if (musts == NULL) + return; + mp = musts; + for (i = 0; i <= dfa->tindex; ++i) + mp[i] = must0; + for (i = 0; i <= dfa->tindex; ++i) + { + mp[i].in = (char **) malloc(sizeof *mp[i].in); + mp[i].left = malloc(2); + mp[i].right = malloc(2); + mp[i].is = malloc(2); + if (mp[i].in == NULL || mp[i].left == NULL || + mp[i].right == NULL || mp[i].is == NULL) + goto done; + mp[i].left[0] = mp[i].right[0] = mp[i].is[0] = '\0'; + mp[i].in[0] = NULL; + } + #ifdef DEBUG + fprintf(stderr, "dfamust:\n"); + for (i = 0; i < dfa->tindex; ++i) + { + fprintf(stderr, " %d:", i); + prtok(dfa->tokens[i]); + } + putc('\n', stderr); + #endif + for (ri = 0; ri < dfa->tindex; ++ri) + { + switch (t = dfa->tokens[ri]) + { + case LPAREN: + case RPAREN: + goto done; /* "cannot happen" */ + case EMPTY: + case BEGLINE: + case ENDLINE: + case BEGWORD: + case ENDWORD: + case LIMWORD: + case NOTLIMWORD:
+ case BACKREF: + resetmust(mp); + break; + case STAR: + case QMARK: + if (mp <= musts) + goto done; /* "cannot happen" */ + --mp; + resetmust(mp); + break; + case OR: + case ORTOP: + if (mp < &musts[2]) + goto done; /* "cannot happen" */ + { + char **new; + must *lmp; + must *rmp; + int j, ln, rn, n; + + rmp = --mp; + lmp = --mp; + /* Guaranteed to be. Unlikely, but. . . */ + if (strcmp(lmp->is, rmp->is) != 0) + lmp->is[0] = '\0'; + /* Left side--easy */ + i = 0; + while (lmp->left[i] != '\0' && lmp->left[i] == rmp->left[i]) + ++i; + lmp->left[i] = '\0'; + /* Right side */ + ln = strlen(lmp->right); + rn = strlen(rmp->right); + n = ln; + if (n > rn) + n = rn; + for (i = 0; i < n; ++i) + if (lmp->right[ln - i - 1] != rmp->right[rn - i - 1]) + break; + for (j = 0; j < i; ++j) + lmp->right[j] = lmp->right[(ln - i) + j]; + lmp->right[j] = '\0'; + new = inboth(lmp->in, rmp->in); + if (new == NULL) + goto done; + freelist(lmp->in); + free((char *) lmp->in); + lmp->in = new; + } + break; + case PLUS: + if (mp <= musts) + goto done; /* "cannot happen" */ + --mp; + mp->is[0] = '\0'; + break; + case END: + if (mp != &musts[1]) + goto done; /* "cannot happen" */ + for (i = 0; musts[0].in[i] != NULL; ++i) + if (strlen(musts[0].in[i]) > strlen(result)) + result = musts[0].in[i]; + if (strcmp(result, musts[0].is) == 0) + exact = 1; + goto done; + case CAT: + if (mp < &musts[2]) + goto done; /* "cannot happen" */ + { + must *lmp; + must *rmp; + + rmp = --mp; + lmp = --mp; + /* In. Everything in left, plus everything in + right, plus catenation of + left's right and right's left.
*/ + lmp->in = addlists(lmp->in, rmp->in); + if (lmp->in == NULL) + goto done; + if (lmp->right[0] != '\0' && + rmp->left[0] != '\0') + { + char *tp; + + tp = icpyalloc(lmp->right); + if (tp == NULL) + goto done; + tp = icatalloc(tp, rmp->left); + if (tp == NULL) + goto done; + lmp->in = enlist(lmp->in, tp, + strlen(tp)); + free(tp); + if (lmp->in == NULL) + goto done; + } + /* Left-hand */ + if (lmp->is[0] != '\0') + { + lmp->left = icatalloc(lmp->left, + rmp->left); + if (lmp->left == NULL) + goto done; + } + /* Right-hand */ + if (rmp->is[0] == '\0') + lmp->right[0] = '\0'; + lmp->right = icatalloc(lmp->right, rmp->right); + if (lmp->right == NULL) + goto done; + /* Guaranteed to be */ + if (lmp->is[0] != '\0' && rmp->is[0] != '\0') + { + lmp->is = icatalloc(lmp->is, rmp->is); + if (lmp->is == NULL) + goto done; + } + else + lmp->is[0] = '\0'; + } + break; + default: + if (t < END) + { + /* "cannot happen" */ + goto done; + } + else if (t == '\0') + { + /* not on *my* shift */ + goto done; + } + else if (t >= CSET) + { + /* easy enough */ + resetmust(mp); + } + else + { + /* plain character */ + resetmust(mp); + mp->is[0] = mp->left[0] = mp->right[0] = t; + mp->is[1] = mp->left[1] = mp->right[1] = '\0'; + mp->in = enlist(mp->in, mp->is, (size_t)1); + if (mp->in == NULL) + goto done; + } + break; + } + #ifdef DEBUG + fprintf(stderr, " node: %d:", ri); + prtok(dfa->tokens[ri]); + fprintf(stderr, "\n in:"); + for (i = 0; mp->in[i]; ++i) + fprintf(stderr, " \"%s\"", mp->in[i]); + fprintf(stderr, "\n is: \"%s\"\n", mp->is); + fprintf(stderr, " left: \"%s\"\n", mp->left); + fprintf(stderr, " right: \"%s\"\n", mp->right); + #endif + ++mp; + } + done: /* Record the chosen string. If either allocation fails, record nothing instead of dereferencing a null pointer; result still points into musts[], so it must be copied before the cleanup loop below. */ + if (strlen(result)) + { + dm = (struct dfamust *) malloc(sizeof (struct dfamust)); + if (dm != NULL && (dm->must = malloc(strlen(result) + 1)) == NULL) + { free((char *) dm); dm = NULL; } + if (dm != NULL) + { dm->exact = exact; strcpy(dm->must, result); + dm->next = dfa->musts; dfa->musts = dm; } + } + mp = musts; + for (i = 0; i <= dfa->tindex; ++i) + { + freelist(mp[i].in); + ifree((char *)
mp[i].in); + ifree(mp[i].left); + ifree(mp[i].right); + ifree(mp[i].is); + } + free((char *) mp); + } diff -crP php-2.0.1/src/jp.regex/dfa.h php-2.0.1.jp_urat-5.3/src/jp.regex/dfa.h *** php-2.0.1/src/jp.regex/dfa.h Thu Jan 1 09:00:00 1970 --- php-2.0.1.jp_urat-5.3/src/jp.regex/dfa.h Wed Feb 18 21:03:40 1998 *************** *** 0 **** --- 1,360 ---- + /* dfa.h - declarations for GNU deterministic regexp compiler + Copyright (C) 1988 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ + + /* Written June, 1988 by Mike Haertel */ + + /* FIXME: + 2. We should not export so much of the DFA internals. + In addition to clobbering modularity, we eat up valuable + name space. */ + + /* Number of bits in an unsigned char. */ + #define CHARBITS 8 + + /* First integer value that is greater than any character code. */ + #define NOTCHAR (1 << CHARBITS) + + /* INTBITS need not be exact, just a lower bound. */ + #define INTBITS (CHARBITS * sizeof (int)) + + /* Number of ints required to hold a bit for every character. */ + #define CHARCLASS_INTS ((NOTCHAR + INTBITS - 1) / INTBITS) + + /* Sets of unsigned characters are stored as bit vectors in arrays of ints. */ + typedef int charclass[CHARCLASS_INTS]; + + /* The regexp is parsed into an array of tokens in postfix form. Some tokens + are operators and others are terminal symbols.
Most (but not all) of these + codes are returned by the lexical analyzer. */ + + typedef enum + { + END = -1, /* END is a terminal symbol that matches the + end of input; any value of END or less in + the parse tree is such a symbol. Accepting + states of the DFA are those that would have + a transition on END. */ + + /* Ordinary character values are terminal symbols that match themselves. */ + + EMPTY = NOTCHAR, /* EMPTY is a terminal symbol that matches + the empty string. */ + + BACKREF, /* BACKREF is generated by \<digit>; it + is not completely handled. If the scanner + detects a transition on backref, it returns + a kind of "semi-success" indicating that + the match will have to be verified with + a backtracking matcher. */ + + BEGLINE, /* BEGLINE is a terminal symbol that matches + the empty string if it is at the beginning + of a line. */ + + ENDLINE, /* ENDLINE is a terminal symbol that matches + the empty string if it is at the end of + a line. */ + + BEGWORD, /* BEGWORD is a terminal symbol that matches + the empty string if it is at the beginning + of a word. */ + + ENDWORD, /* ENDWORD is a terminal symbol that matches + the empty string if it is at the end of + a word. */ + + LIMWORD, /* LIMWORD is a terminal symbol that matches + the empty string if it is at the beginning + or the end of a word. */ + + NOTLIMWORD, /* NOTLIMWORD is a terminal symbol that + matches the empty string if it is not at + the beginning or end of a word. */ + + QMARK, /* QMARK is an operator of one argument that + matches zero or one occurrences of its + argument. */ + + STAR, /* STAR is an operator of one argument that + matches the Kleene closure (zero or more + occurrences) of its argument. */ + + PLUS, /* PLUS is an operator of one argument that + matches the positive closure (one or more + occurrences) of its argument. */ + + REPMN, /* REPMN is a lexical token corresponding + to the {m,n} construct. REPMN never + appears in the compiled token vector.
*/ + + CAT, /* CAT is an operator of two arguments that + matches the concatenation of its + arguments. CAT is never returned by the + lexical analyzer. */ + + OR, /* OR is an operator of two arguments that + matches either of its arguments. */ + + ORTOP, /* OR at the toplevel in the parse tree. + This is used for a boyer-moore heuristic. */ + + LPAREN, /* LPAREN never appears in the parse tree, + it is only a lexeme. */ + + RPAREN, /* RPAREN never appears in the parse tree. */ + + CSET /* CSET (and any value greater) is a + terminal symbol that matches any of a + class of characters. */ + } token; + + /* Sets are stored in an array in the compiled dfa; the index of the + array corresponding to a given set token is given by SET_INDEX(t). */ + #define SET_INDEX(t) ((t) - CSET) + + /* Sometimes characters can only be matched depending on the surrounding + context. Such context decisions depend on what the previous character + was, and the value of the current (lookahead) character. Context + dependent constraints are encoded as 8 bit integers. Each bit that + is set indicates that the constraint succeeds in the corresponding + context. + + bit 7 - previous and current are newlines + bit 6 - previous was newline, current isn't + bit 5 - previous wasn't newline, current is + bit 4 - neither previous nor current is a newline + bit 3 - previous and current are word-constituents + bit 2 - previous was word-constituent, current isn't + bit 1 - previous wasn't word-constituent, current is + bit 0 - neither previous nor current is word-constituent + + Word-constituent characters are those that satisfy isalnum(). + + The macro SUCCEEDS_IN_CONTEXT determines whether a a given constraint + succeeds in a particular context. Prevn is true if the previous character + was a newline, currn is true if the lookahead character is a newline. + Prevl and currl similarly depend upon whether the previous and current + characters are word-constituent letters.
*/ + #define MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ + ((constraint) & 1 << (((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4)) /* tests bit 4 + (prevn ? 2 : 0) + (currn ? 1 : 0); yields the masked bit, not 0/1 */ + #define MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \ + ((constraint) & 1 << (((prevl) ? 2 : 0) + ((currl) ? 1 : 0))) /* tests bit (prevl ? 2 : 0) + (currl ? 1 : 0); yields the masked bit, not 0/1 */ + #define SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \ + (MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \ + && MATCHES_LETTER_CONTEXT(constraint, prevl, currl)) /* both the newline test and the letter test must pass */ + + /* The following macros give information about what a constraint depends on. */ + #define PREV_NEWLINE_DEPENDENT(constraint) \ + (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30)) /* true when bits 7-6 differ from bits 5-4 */ + #define PREV_LETTER_DEPENDENT(constraint) \ + (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03)) /* true when bits 3-2 differ from bits 1-0 */ + + /* Tokens that match the empty string subject to some constraint actually + work by applying that constraint to determine what may follow them, + taking into account what has gone before. The following values are + the constraints corresponding to the special tokens previously defined. */ + #define NO_CONSTRAINT 0xff /* all eight context bits set */ + #define BEGLINE_CONSTRAINT 0xcf /* newline bits 7 and 6 only: previous character was a newline */ + #define ENDLINE_CONSTRAINT 0xaf /* newline bits 7 and 5 only: current character is a newline */ + #define BEGWORD_CONSTRAINT 0xf2 /* letter bit 1 only: previous wasn't word-constituent, current is */ + #define ENDWORD_CONSTRAINT 0xf4 /* letter bit 2 only: previous was word-constituent, current isn't */ + #define LIMWORD_CONSTRAINT 0xf6 /* letter bits 1 and 2: exactly one of previous/current is word-constituent */ + #define NOTLIMWORD_CONSTRAINT 0xf9 /* letter bits 3 and 0: previous and current are both, or neither, word-constituent */ + + /* States of the recognizer correspond to sets of positions in the parse + tree, together with the constraints under which they may be matched. + So a position is encoded as an index into the parse tree together with + a constraint. */ + typedef struct + { + unsigned index; /* Index into the parse array. */ + unsigned constraint; /* Constraint for matching this position. */ + } position; + + /* Sets of positions are stored as arrays. */ + typedef struct + { + position *elems; /* Elements of this position set. */ + int nelem; /* Number of elements in this set.
*/ + } position_set; + + /* A state of the dfa consists of a set of positions, some flags, + and the token value of the lowest-numbered position of the state that + contains an END token. */ + typedef struct + { + int hash; /* Hash of the positions of this state. */ + position_set elems; /* Positions this state could match. */ + char newline; /* True if previous state matched newline. */ + char letter; /* True if previous state matched a letter. */ + char backref; /* True if this state matches a \. */ + unsigned char constraint; /* Constraint for this state to accept. */ + int first_end; /* Token value of the first END in elems. */ + } dfa_state; + + /* Element of a list of strings, at least one of which is known to + appear in any R.E. matching the DFA. */ + struct dfamust + { + int exact; + char *must; + struct dfamust *next; + }; + + /* A compiled regular expression. */ + struct dfa + { + /* Stuff built by the scanner. */ + charclass *charclasses; /* Array of character sets for CSET tokens. */ + int cindex; /* Index for adding new charclasses. */ + int calloc; /* Number of charclasses currently allocated. */ + + /* Stuff built by the parser. */ + token *tokens; /* Postfix parse array. */ + int tindex; /* Index for adding new tokens. */ + int talloc; /* Number of tokens currently allocated. */ + int depth; /* Depth required of an evaluation stack + used for depth-first traversal of the + parse tree. */ + int nleaves; /* Number of leaves on the parse tree. */ + int nregexps; /* Count of parallel regexps being built + with dfaparse(). */ + + /* Stuff owned by the state builder. */ + dfa_state *states; /* States of the dfa. */ + int sindex; /* Index for adding new states. */ + int salloc; /* Number of states currently allocated. */ + + /* Stuff built by the structure analyzer. */ + position_set *follows; /* Array of follow sets, indexed by position + index. The follow of a position is the set + of positions co