From owner-svn-soc-all@FreeBSD.ORG Sun Aug 14 14:27:56 2011 Return-Path: Delivered-To: svn-soc-all@FreeBSD.org Received: from socsvn.FreeBSD.org (unknown [IPv6:2001:4f8:fff6::2f]) by hub.freebsd.org (Postfix) with SMTP id C3E071065672 for ; Sun, 14 Aug 2011 14:27:54 +0000 (UTC) (envelope-from zy@FreeBSD.org) Received: by socsvn.FreeBSD.org (sSMTP sendmail emulation); Sun, 14 Aug 2011 14:27:54 +0000 Date: Sun, 14 Aug 2011 14:27:54 +0000 From: zy@FreeBSD.org To: svn-soc-all@FreeBSD.org MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Message-Id: <20110814142754.C3E071065672@hub.freebsd.org> Cc: Subject: socsvn commit: r225111 - in soc2011/zy/nvi-iconv/head: contrib/nvi2/common contrib/nvi2/ex contrib/nvi2/vi usr.bin/vi X-BeenThere: svn-soc-all@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: SVN commit messages for the entire Summer of Code repository List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sun, 14 Aug 2011 14:27:56 -0000 Author: zy Date: Sun Aug 14 14:27:54 2011 New Revision: 225111 URL: http://svnweb.FreeBSD.org/socsvn/?view=rev&rev=225111 Log: Updates to git:0c15828; this version features: * Stable :vsplit support; * Standard and general file encoding detection (UTF-8 & UTF-16); * UTF-16 support. The patch is ready for testing in the community. 
Added: soc2011/zy/nvi-iconv/head/contrib/nvi2/common/encoding.c Modified: soc2011/zy/nvi-iconv/head/contrib/nvi2/common/conv.c soc2011/zy/nvi-iconv/head/contrib/nvi2/common/exf.c soc2011/zy/nvi-iconv/head/contrib/nvi2/common/exf.h soc2011/zy/nvi-iconv/head/contrib/nvi2/common/extern.h soc2011/zy/nvi-iconv/head/contrib/nvi2/common/line.c soc2011/zy/nvi-iconv/head/contrib/nvi2/ex/ex.h soc2011/zy/nvi-iconv/head/contrib/nvi2/ex/ex_write.c soc2011/zy/nvi-iconv/head/contrib/nvi2/vi/vi.c soc2011/zy/nvi-iconv/head/contrib/nvi2/vi/vs_refresh.c soc2011/zy/nvi-iconv/head/usr.bin/vi/Makefile Modified: soc2011/zy/nvi-iconv/head/contrib/nvi2/common/conv.c ============================================================================== --- soc2011/zy/nvi-iconv/head/contrib/nvi2/common/conv.c Sun Aug 14 13:37:38 2011 (r225110) +++ soc2011/zy/nvi-iconv/head/contrib/nvi2/common/conv.c Sun Aug 14 14:27:54 2011 (r225111) @@ -10,7 +10,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "$Id: conv.c,v 1.28 2011/07/16 14:40:06 zy Exp $ (Berkeley) $Date: 2011/07/16 14:40:06 $"; +static const char sccsid[] = "$Id: conv.c,v 1.29 2011/08/13 12:53:23 zy Exp $ (Berkeley) $Date: 2011/08/13 12:53:23 $"; #endif /* not lint */ #include @@ -23,6 +23,7 @@ #include #include #include +#include #include #include "common.h" @@ -89,6 +90,10 @@ default_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen, CHAR_T **dst, char *enc) { + /* XXX UTF-16 linesep hack */ + if (!strncasecmp(enc, "utf-16", 6) && len % 2) + len -= 1; + int i = 0, j; CHAR_T **tostr = &cw->b_wc1; size_t *blen = &cw->blen1; Added: soc2011/zy/nvi-iconv/head/contrib/nvi2/common/encoding.c ============================================================================== --- /dev/null 00:00:00 1970 (empty, because file is newly added) +++ soc2011/zy/nvi-iconv/head/contrib/nvi2/common/encoding.c Sun Aug 14 14:27:54 2011 (r225111) @@ -0,0 +1,162 @@ +/*- + * Copyright (c) 2011 + * Zhihao Yuan. 
All rights reserved.
+ *
+ * See the LICENSE file for redistribution information.
+ */
+
+#ifndef lint
+static const char sccsid[] = "$Id: encoding.c,v 1.2 2011/08/13 22:58:03 zy Exp $ (Berkeley) $Date: 2011/08/13 22:58:03 $";
+#endif /* not lint */
+
+#include
+
+#define F 0   /* character never appears in text */
+#define T 1   /* character appears in plain ASCII text */
+#define I 2   /* character appears in ISO-8859 text */
+#define X 3   /* character appears in non-ISO extended ASCII (Mac, IBM PC) */
+/* Class (F, T, I or X) of each of the 256 byte values, indexed by byte. */
+static char text_chars[256] = {
+	/*                  BEL BS HT LF    FF CR    */
+	F, F, F, F, F, F, F, T, T, T, T, F, T, T, F, F,  /* 0x0X */
+	/*                              ESC          */
+	F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F,  /* 0x1X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x2X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x3X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x4X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x5X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T,  /* 0x6X */
+	T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F,  /* 0x7X */
+	/*            NEL (0x85 is the only T in this row) */
+	X, X, X, X, X, T, X, X, X, X, X, X, X, X, X, X,  /* 0x8X */
+	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,  /* 0x9X */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xaX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xbX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xcX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xdX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,  /* 0xeX */
+	I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I   /* 0xfX */
+};
+
+/*
+ * looks_utf8 --
+ *	Decide whether some text looks like UTF-8. Returns:
+ *
+ *     -1: invalid UTF-8
+ *      0: uses odd control characters, so doesn't look like text
+ *      1: 7-bit text
+ *      2: definitely UTF-8 text (valid high-bit set bytes)
+ *
+ *	Based on RFC 3629. UTF-8 with BOM is not accepted. 
+ *	buf holds nbytes raw bytes; it need not be NUL-terminated.
+ * PUBLIC: int looks_utf8 __P((const char *, size_t));
+ */
+int
+looks_utf8(const char *buf, size_t nbytes)
+{
+	size_t i;
+	int n;
+	int gotone = 0, ctrl = 0;
+
+	for (i = 0; i < nbytes; i++) {
+		if ((buf[i] & 0x80) == 0) {	   /* 0xxxxxxx is plain ASCII */
+			/*
+			 * Even if the whole file is valid UTF-8 sequences,
+			 * still reject it if it uses weird control characters.
+			 */
+
+			if (text_chars[(u_char)buf[i]] != T)
+				ctrl = 1;
+		} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
+			return -1;
+		} else {			   /* 11xxxxxx begins UTF-8 */
+			int following;
+
+			if ((buf[i] & 0x20) == 0)		/* 110xxxxx */
+				if ((buf[i] & 0x3e))	/* lead is not C0/C1 (overlong) */
+					following = 1;
+				else return -1;
+			else if ((buf[i] & 0x10) == 0)	/* 1110xxxx */
+				following = 2;
+			else if ((buf[i] & 0x08) == 0)	/* 11110xxx */
+				if ((u_char)buf[i] < 0xf5 || 0xf7 < (u_char)buf[i])
+					following = 3;
+				else return -1;		/* F5, F6, F7 */
+			else
+				return -1;
+
+			for (n = 0; n < following; n++) {
+				i++;
+				if (i >= nbytes)	/* buffer ends mid-sequence */
+					goto done;
+
+				if ((buf[i] & 0xc0) != 0x80)	/* must be 10xxxxxx */
+					return -1;
+			}
+
+			gotone = 1;
+		}
+	}
+done:
+	return ctrl ? 0 : (gotone ? 2 : 1);
+}
+
+/*
+ * looks_utf16 --
+ *	Decide whether some text looks like UTF-16. 
Returns:
+ *
+ *      0: no BOM, or invalid UTF-16
+ *      1: Little-endian UTF-16
+ *      2: Big-endian UTF-16
+ *
+ * PUBLIC: int looks_utf16 __P((const char *, size_t));
+ */
+int
+looks_utf16(const char *buf, size_t nbytes)
+{
+	int bigend;
+	size_t i;
+	unsigned int c;
+	int bom;
+	int following = 0;	/* 1 while a low surrogate is expected */
+
+	if (nbytes < 2)
+		return 0;
+
+	bom = ((u_char)buf[0] << 8) + (u_char)buf[1];
+	if (bom == 0xfffe)	/* bytes FF FE */
+		bigend = 0;
+	else if (bom == 0xfeff)	/* bytes FE FF */
+		bigend = 1;
+	else
+		return 0;
+
+	for (i = 2; i + 1 < nbytes; i += 2) {
+		if (bigend)
+			c = (u_char)buf[i + 1] + 256 * (u_char)buf[i];
+		else
+			c = (u_char)buf[i] + 256 * (u_char)buf[i + 1];
+
+		if (following) {
+			/* A high surrogate must be followed by a low one. */
+			if (c < 0xDC00 || c > 0xDFFF)
+				return 0;
+			following = 0;	/* pair complete; expect a new unit */
+		} else if (c < 0xD800 || c > 0xDFFF) {
+			if (c < 128 && text_chars[(size_t)c] != T)
+				return 0;
+		} else if (c > 0xDBFF) {
+			/* Low surrogate with no high surrogate before it. */
+			return 0;
+		} else {
+			following = 1;
+		}
+	}
+
+	return 1 + bigend;
+}
+
+#undef F
+#undef T
+#undef I
+#undef X
Modified: soc2011/zy/nvi-iconv/head/contrib/nvi2/common/exf.c ============================================================================== --- soc2011/zy/nvi-iconv/head/contrib/nvi2/common/exf.c Sun Aug 14 13:37:38 2011 (r225110) +++ soc2011/zy/nvi-iconv/head/contrib/nvi2/common/exf.c Sun Aug 14 14:27:54 2011 (r225111) @@ -10,7 +10,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "$Id: exf.c,v 10.52 2011/07/18 16:58:54 zy Exp $ (Berkeley) $Date: 2011/07/18 16:58:54 $"; +static const char sccsid[] = "$Id: exf.c,v 10.53 2011/07/20 00:38:28 zy Exp $ (Berkeley) $Date: 2011/07/20 00:38:28 $"; #endif /* not lint */ #include @@ -39,6 +39,7 @@ static int file_backup __P((SCR *, char *, char *)); static void file_cinit __P((SCR *)); +static void file_encinit __P((SCR *)); static void file_comment __P((SCR *)); static int file_spath __P((SCR *, FREF *, struct stat *, int *)); @@ -404,12 +405,12 @@ sp->ep = ep; sp->frp = frp; + /* Detect and set the file encoding */ + file_encinit(sp); + /* Set the 
initial cursor position, queue initial command. */ file_cinit(sp); - /* Report conversion errors again. */ - F_CLR(sp, SC_CONV_ERROR); - /* Redraw the screen from scratch, schedule a welcome message. */ F_SET(sp, SC_SCR_REFORMAT | SC_STATUS); @@ -721,7 +722,7 @@ free(ep->rcv_path); if (ep->rcv_mpath != NULL) free(ep->rcv_mpath); - if (ep->c_lp != NULL) + if (ep->c_blen > 0) free(ep->c_lp); free(ep); @@ -1207,12 +1208,63 @@ } if (estr) msgq_str(sp, M_SYSERR, estr, "%s"); + if (d != NULL) + free(d); if (bp != NULL) FREE_SPACE(sp, bp, blen); return (1); }
 /*
+ * file_encinit --
+ *	Sniff the leading lines (up to 4096 bytes) and set O_FILEENCODING.
+ */
+static void
+file_encinit(SCR *sp)
+{
+#if defined(USE_WIDECHAR) && defined(USE_ICONV)
+	size_t len;
+	char *p;
+	size_t blen = 0;
+	char buf[4096];	/* need not be '\0'-terminated */
+	recno_t ln = 1;
+
+	while (!db_rget(sp, ln++, &p, &len)) {
+		if (blen + len > sizeof(buf))
+			len = sizeof(buf) - blen;	/* clip to the room left */
+		memcpy(buf + blen, p, len);
+		blen += len;
+		if (blen == sizeof(buf))
+			break;
+		else
+			buf[blen++] = '\n';	/* separator between lines */
+	}
+
+	if (looks_utf8(buf, blen) > 1)
+		o_set(sp, O_FILEENCODING, OS_STRDUP, "utf-8", 0);
+	else {
+		int st = looks_utf16(buf, blen);
+		if (st > 0 && !db_rget(sp, 1, &p, &len) && len >= 2) {
+			char *np;
+			size_t nblen;	/* capacity written back by GET_SPACE_GOTOC */
+			/* Move the 2-byte BOM out of line 1 and into ep->_bom. */
+			GET_SPACE_GOTOC(sp, np, nblen, len - 2);
+			memcpy(sp->ep->_bom, p, 2);
+			memcpy(np, p + 2, len - 2);
+			db_rset(sp, 1, np, len - 2);	/* store w/o the BOM */
+			FREE_SPACE(sp, np, nblen);	/* release the scratch copy */
+		}
+		if (st == 1)
+			o_set(sp, O_FILEENCODING, OS_STRDUP, "utf-16le", 0);
+		else if (st == 2)
+			o_set(sp, O_FILEENCODING, OS_STRDUP, "utf-16be", 0);
+	}
+	/* Fallback to locale encoding */
+alloc_err:;
+#endif
+}
+
+/*
  * file_comment --
  *	Skip the first comment. 
*/ Modified: soc2011/zy/nvi-iconv/head/contrib/nvi2/common/exf.h ============================================================================== --- soc2011/zy/nvi-iconv/head/contrib/nvi2/common/exf.h Sun Aug 14 13:37:38 2011 (r225110) +++ soc2011/zy/nvi-iconv/head/contrib/nvi2/common/exf.h Sun Aug 14 14:27:54 2011 (r225111) @@ -6,7 +6,7 @@ * * See the LICENSE file for redistribution information. * - * @(#)exf.h 10.7 (Berkeley) 7/9/96 + * $Id: exf.h,v 10.8 2011/08/13 17:59:41 zy Exp $ (Berkeley) $Date: 2011/08/13 17:59:41 $ */ /* Undo direction. */ /* @@ -17,6 +17,10 @@ int refcnt; /* Reference count. */ /* Underlying database state. */ + union { + uint16_t bom; /* Byte-order-mark */ + char _bom[2]; + }; DB *db; /* File db structure. */ CHAR_T *c_lp; /* Cached line. */ size_t c_len; /* Cached line length. */ Modified: soc2011/zy/nvi-iconv/head/contrib/nvi2/common/extern.h ============================================================================== --- soc2011/zy/nvi-iconv/head/contrib/nvi2/common/extern.h Sun Aug 14 13:37:38 2011 (r225110) +++ soc2011/zy/nvi-iconv/head/contrib/nvi2/common/extern.h Sun Aug 14 14:27:54 2011 (r225111) @@ -7,6 +7,8 @@ void text_lfree __P((TEXTH *)); void text_free __P((TEXT *)); int del __P((SCR *, MARK *, MARK *, int)); +int looks_utf8 __P((const char *, size_t)); +int looks_utf16 __P((const char *, size_t)); FREF *file_add __P((SCR *, char *)); int file_init __P((SCR *, FREF *, char *, int)); int file_end __P((SCR *, EXF *, int)); @@ -34,6 +36,8 @@ int db_set __P((SCR *, recno_t, CHAR_T *, size_t)); int db_exist __P((SCR *, recno_t)); int db_last __P((SCR *, recno_t *)); +int db_rget __P((SCR *, recno_t, char **, size_t *)); +int db_rset __P((SCR *, recno_t, char *, size_t)); void db_err __P((SCR *, recno_t)); int log_init __P((SCR *, EXF *)); int log_end __P((SCR *, EXF *)); Modified: soc2011/zy/nvi-iconv/head/contrib/nvi2/common/line.c ============================================================================== --- 
soc2011/zy/nvi-iconv/head/contrib/nvi2/common/line.c Sun Aug 14 13:37:38 2011 (r225110) +++ soc2011/zy/nvi-iconv/head/contrib/nvi2/common/line.c Sun Aug 14 14:27:54 2011 (r225111) @@ -10,7 +10,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "$Id: line.c,v 10.24 2011/07/18 16:10:48 zy Exp $ (Berkeley) $Date: 2011/07/18 16:10:48 $"; +static const char sccsid[] = "$Id: line.c,v 10.26 2011/08/12 12:36:41 zy Exp $ (Berkeley) $Date: 2011/08/12 12:36:41 $"; #endif /* not lint */ #include @@ -580,6 +580,72 @@ }
 /*
+ * db_rget --
+ *	Retrieve a raw line from database. No cache, no conversion.
+ *	Returns 0 on success, 1 if there is no file or the get fails.
+ * PUBLIC: int db_rget __P((SCR *, recno_t, char **, size_t *));
+ */
+int
+db_rget(
+	SCR *sp,
+	recno_t lno,				/* Line number. */
+	char **pp,				/* Pointer store (may be NULL). */
+	size_t *lenp)				/* Length store (may be NULL). */
+{
+	DBT data, key;
+	EXF *ep;
+
+	/* Check for no underlying file. */
+	if ((ep = sp->ep) == NULL)
+		return (1);
+
+	/* Get the line from the underlying database. */
+	key.data = &lno;
+	key.size = sizeof(lno);
+	if (ep->db->get(ep->db, &key, &data, 0))
+		/* Any non-zero result is a failure; no message is issued. */
+		return (1);
+	/* NOTE(review): *pp aliases data.data from the DB layer; confirm its lifetime. */
+	if (lenp != NULL)
+		*lenp = data.size;
+	if (pp != NULL)
+		*pp = data.data;
+	return (0);
+}
+
+/*
+ * db_rset --
+ *	Store a line in the file. No log, no conversion.
+ *	Returns 0 on success, 1 if there is no file or the put returns -1.
+ * PUBLIC: int db_rset __P((SCR *, recno_t, char *, size_t));
+ */
+int
+db_rset(
+	SCR *sp,
+	recno_t lno,				/* Line number. */
+	char *p,				/* Raw bytes to store. */
+	size_t len)				/* Number of bytes at p. */
+{
+	DBT data, key;
+	EXF *ep;
+
+	/* Check for no underlying file. */
+	if ((ep = sp->ep) == NULL)
+		return (1);
+
+	/* Update file. */
+	key.data = &lno;
+	key.size = sizeof(lno);
+	data.data = p;
+	data.size = len;
+	if (ep->db->put(ep->db, &key, &data, 0) == -1)
+		/* Only -1 is treated as failure; no message is issued. */
+		return (1);
+	/* NOTE(review): only the db is written here; confirm no cached line needs a reset. */
+	return (0);
+}
+
+/*
  * db_err --
  *	Report a line error. 
* Modified: soc2011/zy/nvi-iconv/head/contrib/nvi2/ex/ex.h ============================================================================== --- soc2011/zy/nvi-iconv/head/contrib/nvi2/ex/ex.h Sun Aug 14 13:37:38 2011 (r225110) +++ soc2011/zy/nvi-iconv/head/contrib/nvi2/ex/ex.h Sun Aug 14 14:27:54 2011 (r225111) @@ -133,7 +133,7 @@ #define E_C_PRINT 0x01000 /* p flag. */ u_int16_t iflags; /* User input information. */ -#define __INUSE2 0x000007ff /* Same name space as EXCMDLIST. */ +#define __INUSE2 0x000004ff /* Same name space as EXCMDLIST. */ #define E_BLIGNORE 0x00000800 /* Ignore blank lines. */ #define E_NAMEDISCARD 0x00001000 /* Free/discard the name. */ #define E_NOAUTO 0x00002000 /* Don't do autoprint output. */ Modified: soc2011/zy/nvi-iconv/head/contrib/nvi2/ex/ex_write.c ============================================================================== --- soc2011/zy/nvi-iconv/head/contrib/nvi2/ex/ex_write.c Sun Aug 14 13:37:38 2011 (r225110) +++ soc2011/zy/nvi-iconv/head/contrib/nvi2/ex/ex_write.c Sun Aug 14 14:27:54 2011 (r225111) @@ -10,7 +10,7 @@ #include "config.h" #ifndef lint -static const char sccsid[] = "$Id: ex_write.c,v 10.38 2001/06/25 15:19:22 skimo Exp $ (Berkeley) $Date: 2001/06/25 15:19:22 $"; +static const char sccsid[] = "$Id: ex_write.c,v 10.39 2011/08/13 18:28:15 zy Exp $ (Berkeley) $Date: 2011/08/13 18:28:15 $"; #endif /* not lint */ #include @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "../common/common.h" @@ -287,6 +288,7 @@ CHAR_T *p; char *f; size_t flen; + int isutf16; gp = sp->gp; fline = fm->lno; @@ -315,7 +317,12 @@ ccnt = 0; lcnt = 0; msg = "253|Writing..."; - if (tline != 0) + + isutf16 = !strncasecmp(O_STR(sp, O_FILEENCODING), "utf-16", 6); + + if (tline != 0) { + if (sp->ep->bom && fwrite(&sp->ep->bom, 2, 1, fp) != 1) + goto err; for (; fline <= tline; ++fline, ++lcnt) { /* Caller has to provide any interrupt message. 
*/ if ((lcnt + 1) % INTERRUPT_CHECK == 0) { @@ -333,10 +340,13 @@ if (fwrite(f, 1, flen, fp) != flen) goto err; ccnt += len; + if (isutf16 && putc('\0', fp) != '\0') + break; /* UTF-16 uses '000a' as EOL */ if (putc('\n', fp) != '\n') break; ++ccnt; } + } if (fflush(fp)) goto err; Modified: soc2011/zy/nvi-iconv/head/contrib/nvi2/vi/vi.c ============================================================================== --- soc2011/zy/nvi-iconv/head/contrib/nvi2/vi/vi.c Sun Aug 14 13:37:38 2011 (r225110) +++ soc2011/zy/nvi-iconv/head/contrib/nvi2/vi/vi.c Sun Aug 14 14:27:54 2011 (r225111) @@ -403,6 +403,7 @@ if (F_ISSET(gp, G_SRESTART) || F_ISSET(sp, SC_EX)) { *spp = sp; v_dtoh(sp); + gp->scr_discard(sp, NULL); break; } } @@ -1015,6 +1016,9 @@ } CIRCLEQ_REMOVE(&gp->dq, tsp, q); CIRCLEQ_INSERT_TAIL(&gp->hq, tsp, q); + /* XXXX Change if hidden screens per window */ + tsp->gp = 0; + gp->scr_discard(tsp, NULL); } /* Move current screen back to the display queue. */ Modified: soc2011/zy/nvi-iconv/head/contrib/nvi2/vi/vs_refresh.c ============================================================================== --- soc2011/zy/nvi-iconv/head/contrib/nvi2/vi/vs_refresh.c Sun Aug 14 13:37:38 2011 (r225110) +++ soc2011/zy/nvi-iconv/head/contrib/nvi2/vi/vs_refresh.c Sun Aug 14 14:27:54 2011 (r225111) @@ -345,7 +345,7 @@ tmp.lno = LNO; tmp.coff = HMAP->coff; tmp.soff = 1; - lcnt = vs_sm_nlines(sp, &tmp, lastline+1, sp->t_rows); + lcnt = vs_sm_nlines(sp, &tmp, lastline, sp->t_rows); if (lcnt < HALFTEXT(sp)) { if (vs_sm_fill(sp, lastline, P_BOTTOM)) return (1); Modified: soc2011/zy/nvi-iconv/head/usr.bin/vi/Makefile ============================================================================== --- soc2011/zy/nvi-iconv/head/usr.bin/vi/Makefile Sun Aug 14 13:37:38 2011 (r225110) +++ soc2011/zy/nvi-iconv/head/usr.bin/vi/Makefile Sun Aug 14 14:27:54 2011 (r225111) @@ -57,8 +57,9 @@ SRCS+= cl_bsd.c cl_funcs.c cl_main.c cl_read.c cl_screen.c cl_term.c # General sources. 
-SRCS+= cut.c conv.c delete.c exf.c key.c line.c log.c main.c mark.c msg.c \ - options.c options_f.c put.c screen.c search.c seq.c recover.c util.c +SRCS+= cut.c conv.c delete.c encoding.c exf.c key.c line.c log.c main.c \ + mark.c msg.c options.c options_f.c put.c screen.c search.c seq.c \ + recover.c util.c # Ex source. SRCS+= ex.c ex_abbrev.c ex_append.c ex_args.c ex_argv.c ex_at.c ex_bang.c \