Add UTF-8 and grapheme cluster support

Implement comprehensive UTF-8 handling for linenoise: Core UTF-8 support: - Proper multi-byte character navigation (left/right arrows) - Correct backspace deletion for multi-byte characters - Display width calculation for cursor positioning - Wide character support (CJK, emoji) as 2-column display Grapheme cluster support for complex emoji: - Variation selectors (U+FE0E, U+FE0F) for emoji style - Skin tone modifiers (U+1F3FB-U+1F3FF) - Zero Width Joiner (U+200D) sequences like rainbow flag - Regional indicators for flag emoji - Combining diacritical marks Navigation and deletion now treat entire grapheme clusters as single units. For example, 🏳️‍🌈 (14 bytes, 4 codepoints) is handled as one character for cursor movement and backspace. Multiline mode fixes: - Fix history navigation regression where going from multi-row to single-row entries left dirty rows on screen - Save actual cursor row position (oldrpos) instead of recalculating Updates to linenoise.c: - Add helper functions for UTF-8 decoding and grapheme detection - Rewrite utf8PrevCharLen/utf8NextCharLen for grapheme clusters - Add utf8CharWidth with proper zero-width character handling - Add utf8StrWidth with ZWJ sequence support - Fix refreshMultiLine cursor row tracking Updates to linenoise.h: - Add oldrpos field to linenoiseState for multiline cursor tracking Updates to README: - Document UTF-8 support for multi-byte characters and emoji - Update line count from ~850 to ~1100 - Add "Running the tests" section 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-03-27 05:29:55 +00:00 · 2026-01-07 21:58:20 +01:00
parent e26268de5e
commit c12b66d255
3 changed files with 504 additions and 78 deletions
--- a/README.markdown
+++ b/README.markdown
@@ -8,7 +8,8 @@ MongoDB, Android and many other projects.
 * Completion.
 * Hints (suggestions at the right of the prompt as you type).
 * Multiplexing mode, with prompt hiding/restoring for asynchronous output.
-* About ~850 lines (comments and spaces excluded) of BSD license source code.
+* UTF-8 support for multi-byte characters and emoji.
+* About 1100 lines (comments and spaces excluded) of BSD license source code.
 * Only uses a subset of VT100 escapes (ANSI.SYS compatible).

 ## Can a line editing library be 20k lines of code?
@@ -341,7 +342,22 @@ example using select(2) and the asynchronous API:

 You can test the example by running the example program with the `--async` option.

+## Running the tests
+
+Linenoise has a test suite that uses a VT100 terminal emulator to verify correct behavior. The tests cover basic editing, cursor movement, UTF-8 handling, horizontal scrolling, and multiline mode.
+
+To run the tests:
+
+    make test
+
+Or build and run separately:
+
+    make linenoise-test
+    ./linenoise-test
+
+The test harness forks linenoise_example, communicates via pipes, and uses a VT100 emulator to verify screen output and cursor positioning.
+
 ## Related projects

-* [Linenoise NG](https://github.com/arangodb/linenoise-ng) is a fork of Linenoise that aims to add more advanced features like UTF-8 support, Windows support and other features. Uses C++ instead of C as development language.
+* [Linenoise NG](https://github.com/arangodb/linenoise-ng) is a fork of Linenoise that aims to add more advanced features such as Windows support. Uses C++ instead of C as development language.
 * [Linenoise-swift](https://github.com/andybest/linenoise-swift) is a reimplementation of Linenoise written in Swift.
--- a/linenoise.c
+++ b/linenoise.c
@@ -115,6 +115,7 @@
 #include <sys/types.h>
 #include <sys/ioctl.h>
 #include <unistd.h>
+#include <stdint.h>
 #include "linenoise.h"

 #define LINENOISE_DEFAULT_HISTORY_MAX_LEN 100
@@ -136,6 +137,315 @@ static int history_max_len = LINENOISE_DEFAULT_HISTORY_MAX_LEN;
 static int history_len = 0;
 static char **history = NULL;

+/* =========================== UTF-8 support ================================ */
+
+/* Return the total byte length of the UTF-8 sequence whose first byte is
+ * 'c'. Only this lead byte is examined; the bytes that follow are not
+ * inspected. The four standard lead byte patterns are recognized:
+ *   0xxxxxxx -> 1 byte (ASCII)
+ *   110xxxxx -> 2 bytes
+ *   1110xxxx -> 3 bytes
+ *   11110xxx -> 4 bytes */
+static int utf8ByteLen(char c) {
+    unsigned char uc = (unsigned char)c;
+    if ((uc & 0x80) == 0)    return 1;   /* 0xxxxxxx: ASCII */
+    if ((uc & 0xE0) == 0xC0) return 2;   /* 110xxxxx: 2-byte seq */
+    if ((uc & 0xF0) == 0xE0) return 3;   /* 1110xxxx: 3-byte seq */
+    if ((uc & 0xF8) == 0xF0) return 4;   /* 11110xxx: 4-byte seq */
+    return 1; /* 10xxxxxx or 11111xxx: not a valid lead byte, count it alone. */
+}
+
+/* Decode the UTF-8 sequence starting at 's' and return its codepoint,
+ * storing the number of bytes consumed in '*len'.
+ * Input is not trusted to be well formed: if the lead byte is invalid, or
+ * a following byte is not a 10xxxxxx continuation byte (which includes the
+ * terminating NUL of a truncated sequence), the first byte alone is
+ * returned with '*len' set to 1. Decoding therefore never reads or skips
+ * past the end of a NUL-terminated buffer. */
+static uint32_t utf8DecodeChar(const char *s, size_t *len) {
+    const unsigned char *p = (const unsigned char *)s;
+    size_t need, i;
+    uint32_t cp;
+
+    if ((*p & 0x80) == 0) {             /* 0xxxxxxx: ASCII */
+        need = 1; cp = *p;
+    } else if ((*p & 0xE0) == 0xC0) {   /* 110xxxxx: 2-byte seq */
+        need = 2; cp = *p & 0x1F;
+    } else if ((*p & 0xF0) == 0xE0) {   /* 1110xxxx: 3-byte seq */
+        need = 3; cp = *p & 0x0F;
+    } else if ((*p & 0xF8) == 0xF0) {   /* 11110xxx: 4-byte seq */
+        need = 4; cp = *p & 0x07;
+    } else {                            /* Stray continuation or bad lead. */
+        need = 1; cp = *p;
+    }
+    /* Fold in six payload bits per byte, bailing out on the first bad one. */
+    for (i = 1; i < need; i++) {
+        if ((p[i] & 0xC0) != 0x80) { *len = 1; return *p; }
+        cp = (cp << 6) | (p[i] & 0x3F);
+    }
+    *len = need;
+    return cp;
+}
+
+/* Return non-zero if 'cp' is variation selector U+FE0E or U+FE0F. */
+static int isVariationSelector(uint32_t cp) {
+    return cp == 0xFE0E || cp == 0xFE0F;  /* Text style / emoji style */
+}
+
+/* Return non-zero if 'cp' is a skin tone modifier (U+1F3FB..U+1F3FF). */
+static int isSkinToneModifier(uint32_t cp) {
+    return cp >= 0x1F3FB && cp <= 0x1F3FF;
+}
+
+/* Return non-zero if 'cp' is U+200D, the Zero Width Joiner. */
+static int isZWJ(uint32_t cp) {
+    return cp == 0x200D;
+}
+
+/* Return non-zero if 'cp' is a Regional Indicator (U+1F1E6..U+1F1FF). */
+static int isRegionalIndicator(uint32_t cp) {
+    return cp >= 0x1F1E6 && cp <= 0x1F1FF;  /* Used in pairs for flag emoji. */
+}
+
+/* Return non-zero if 'cp' lies in one of the combining-mark ranges below. */
+static int isCombiningMark(uint32_t cp) {
+    return (cp >= 0x0300 && cp <= 0x036F) ||   /* Combining Diacriticals */
+           (cp >= 0x1AB0 && cp <= 0x1AFF) ||   /* Combining Diacriticals Extended */
+           (cp >= 0x1DC0 && cp <= 0x1DFF) ||   /* Combining Diacriticals Supplement */
+           (cp >= 0x20D0 && cp <= 0x20FF) ||   /* Combining Diacriticals for Symbols */
+           (cp >= 0xFE20 && cp <= 0xFE2F);     /* Combining Half Marks */
+}
+
+/* Return non-zero if 'cp' continues the current grapheme instead of starting one. */
+static int isGraphemeExtend(uint32_t cp) {
+    return isVariationSelector(cp) || isSkinToneModifier(cp) ||
+           isZWJ(cp) || isCombiningMark(cp);
+}
+
+/* Decode the codepoint that ends just before byte offset 'pos' in 'buf'.
+ * Stores its byte length in '*cplen'; at pos == 0 returns 0 with '*cplen' 0. */
+static uint32_t utf8DecodePrev(const char *buf, size_t pos, size_t *cplen) {
+    if (pos == 0) {
+        *cplen = 0;
+        return 0;
+    }
+    /* Step back over 10xxxxxx continuation bytes, at most 4 bytes in all. */
+    size_t i = pos;
+    do {
+        i--;
+    } while (i > 0 && (pos - i) < 4 && ((unsigned char)buf[i] & 0xC0) == 0x80);
+    *cplen = pos - i;  /* Length comes from the backward scan, not the decoder. */
+    size_t dummy;      /* The decoder's own length is discarded. */
+    return utf8DecodeChar(buf + i, &dummy);
+}
+
+/* Return the byte length of the grapheme cluster that ends just before byte
+ * offset 'pos' in 'buf' (0 when 'pos' is 0). A grapheme cluster includes:
+ * - The base character
+ * - Any following variation selectors, skin tone modifiers, combining marks
+ * - ZWJ sequences (emoji joined by Zero Width Joiner)
+ * - Regional indicator pairs (flag emoji) */
+static size_t utf8PrevCharLen(const char *buf, size_t pos) {
+    if (pos == 0) return 0;
+
+    size_t total = 0;
+    size_t curpos = pos;
+
+    size_t cplen;
+    uint32_t cp = utf8DecodePrev(buf, curpos, &cplen);  /* Last codepoint. */
+    if (cplen == 0) return 0;
+    total += cplen;
+    curpos -= cplen;
+
+    /* Walk backwards while the cluster keeps extending; 'cp' is always the
+     * earliest codepoint consumed so far. */
+    while (curpos > 0) {
+        size_t prevlen;
+        uint32_t prevcp = utf8DecodePrev(buf, curpos, &prevlen);
+        if (prevlen == 0) break;
+
+        if (isZWJ(prevcp)) {
+            /* ZWJ joins two emoji: take the ZWJ, then whatever precedes it. */
+            total += prevlen;
+            curpos -= prevlen;
+            prevcp = utf8DecodePrev(buf, curpos, &prevlen);
+            if (prevlen == 0) break;
+            total += prevlen;
+            curpos -= prevlen;
+            cp = prevcp;
+            continue;  /* Check if there's more extending before this. */
+        } else if (isGraphemeExtend(cp)) {
+            /* Current cp is an extending character; include previous. */
+            total += prevlen;
+            curpos -= prevlen;
+            cp = prevcp;
+            continue;
+        } else if (isRegionalIndicator(cp) && isRegionalIndicator(prevcp)) {
+            /* Flags pair up from the start of an indicator run, so 'cp' ends
+             * a flag only when an odd number of indicators precedes it. */
+            size_t run = 0, p = curpos, n;
+            while (p > 0 && isRegionalIndicator(utf8DecodePrev(buf, p, &n))) {
+                run++;
+                p -= n;
+            }
+            if (run % 2 == 1) total += prevlen;
+            break;
+        } else {
+            /* No more extending; we've found the start of the cluster. */
+            break;
+        }
+    }
+
+    return total;
+}
+
+/* Return the byte length of the grapheme cluster that starts at byte offset
+ * 'pos' of the 'len'-byte buffer 'buf'; 0 if 'pos' is at or past the end. */
+static size_t utf8NextCharLen(const char *buf, size_t pos, size_t len) {
+    if (pos >= len) return 0;
+
+    size_t total = 0;
+    size_t curpos = pos;
+
+    /* Get the first codepoint. */
+    size_t cplen;
+    uint32_t cp = utf8DecodeChar(buf + curpos, &cplen);
+    total += cplen;
+    curpos += cplen;
+
+    int isRI = isRegionalIndicator(cp);
+
+    /* Consume any extending characters that follow. */
+    while (curpos < len) {
+        size_t nextlen;
+        uint32_t nextcp = utf8DecodeChar(buf + curpos, &nextlen);
+
+        if (isZWJ(nextcp) && curpos + nextlen < len) {
+            /* ZWJ: include it and the character that follows it. */
+            total += nextlen;
+            curpos += nextlen;
+            nextcp = utf8DecodeChar(buf + curpos, &nextlen);
+            total += nextlen;
+            curpos += nextlen;
+            continue;  /* Check for more extending after the joined char. */
+        } else if (isGraphemeExtend(nextcp)) {
+            /* Variation selector, skin tone, combining mark, etc. */
+            total += nextlen;
+            curpos += nextlen;
+            continue;
+        } else if (isRI && isRegionalIndicator(nextcp)) {
+            /* Second regional indicator for a flag pair. */
+            total += nextlen;
+            curpos += nextlen;
+            isRI = 0;  /* Only pair once. */
+            continue;
+        } else {
+            break;
+        }
+    }
+    /* A sequence truncated by the end of the buffer must not count past it. */
+    if (total > len - pos) total = len - pos;
+    return total;
+}
+
+/* Return the number of terminal columns (0, 1 or 2) taken by codepoint 'cp'.
+ * This is a heuristic; the checks run in this order:
+ * - C0/C1 control characters, DEL and combining marks: 0 columns
+ * - Variation selectors, skin tone modifiers, ZWJ: 0 columns
+ * - Codepoints in the wide ranges listed below (CJK, Hangul, fullwidth
+ *   forms, emoji blocks): 2 columns
+ * - Everything else, including printable ASCII: 1 column
+ *
+ * This is not a full wcwidth() implementation, but a minimal heuristic
+ * that handles emoji and CJK characters reasonably well. */
+static int utf8CharWidth(uint32_t cp) {
+    /* Control characters and combining marks: zero width. */
+    if (cp < 32 || (cp >= 0x7F && cp < 0xA0)) return 0;
+    if (isCombiningMark(cp)) return 0;
+
+    /* Grapheme-extending characters: zero width.
+     * These modify the preceding character rather than taking space. */
+    if (isVariationSelector(cp)) return 0;
+    if (isSkinToneModifier(cp)) return 0;
+    if (isZWJ(cp)) return 0;
+
+    /* Wide character ranges - these display as 2 columns. The leading
+     * 'cp >= 0x1100' test rejects everything below the first range at once:
+     * - CJK Unified Ideographs and Extensions, Fullwidth forms
+     * - Various emoji ranges */
+    if (cp >= 0x1100 &&
+        (cp <= 0x115F ||                      /* Hangul Jamo */
+         cp == 0x2329 || cp == 0x232A ||      /* Angle brackets */
+         (cp >= 0x231A && cp <= 0x231B) ||    /* Watch, Hourglass */
+         (cp >= 0x23E9 && cp <= 0x23F3) ||    /* Various symbols */
+         (cp >= 0x23F8 && cp <= 0x23FA) ||    /* Various symbols */
+         (cp >= 0x25AA && cp <= 0x25AB) ||    /* Small squares */
+         (cp >= 0x25B6 && cp <= 0x25C0) ||    /* Play/reverse buttons */
+         (cp >= 0x25FB && cp <= 0x25FE) ||    /* Squares */
+         (cp >= 0x2600 && cp <= 0x26FF) ||    /* Misc Symbols (sun, cloud, etc) */
+         (cp >= 0x2700 && cp <= 0x27BF) ||    /* Dingbats (❤, ✂, etc) */
+         (cp >= 0x2934 && cp <= 0x2935) ||    /* Arrows */
+         (cp >= 0x2B05 && cp <= 0x2B07) ||    /* Arrows */
+         (cp >= 0x2B1B && cp <= 0x2B1C) ||    /* Squares */
+         cp == 0x2B50 || cp == 0x2B55 ||      /* Star, circle */
+         (cp >= 0x2E80 && cp <= 0xA4CF &&
+          cp != 0x303F) ||                    /* CJK ... Yi */
+         (cp >= 0xAC00 && cp <= 0xD7A3) ||    /* Hangul Syllables */
+         (cp >= 0xF900 && cp <= 0xFAFF) ||    /* CJK Compatibility Ideographs */
+         (cp >= 0xFE10 && cp <= 0xFE1F) ||    /* Vertical forms */
+         (cp >= 0xFE30 && cp <= 0xFE6F) ||    /* CJK Compatibility Forms */
+         (cp >= 0xFF00 && cp <= 0xFF60) ||    /* Fullwidth Forms */
+         (cp >= 0xFFE0 && cp <= 0xFFE6) ||    /* Fullwidth Signs */
+         (cp >= 0x1F1E6 && cp <= 0x1F1FF) ||  /* Regional Indicators (flags) */
+         (cp >= 0x1F300 && cp <= 0x1F64F) ||  /* Misc Symbols and Emoticons */
+         (cp >= 0x1F680 && cp <= 0x1F6FF) ||  /* Transport and Map Symbols */
+         (cp >= 0x1F900 && cp <= 0x1F9FF) ||  /* Supplemental Symbols */
+         (cp >= 0x1FA00 && cp <= 0x1FAFF) ||  /* Chess, Extended-A */
+         (cp >= 0x20000 && cp <= 0x2FFFF)))   /* CJK Extension B and beyond */
+        return 2;
+
+    return 1; /* Default: single width */
+}
+
+/* Return the display width, in terminal columns, of the first 'len' bytes
+ * of the UTF-8 string 's'. Used for cursor positioning in the terminal.
+ * Every codepoint adds utf8CharWidth() columns, except the codepoint right
+ * after a ZWJ, which is treated as joined and adds nothing. */
+static size_t utf8StrWidth(const char *s, size_t len) {
+    size_t width = 0;
+    size_t i = 0;
+    int after_zwj = 0;  /* Set when the previous codepoint was a ZWJ. */
+
+    while (i < len) {
+        size_t clen;
+        uint32_t cp = utf8DecodeChar(s + i, &clen);
+
+        if (after_zwj) {
+            /* Codepoint joined by the preceding ZWJ: it adds no width.
+             * Clear the flag so codepoints after it are measured again. */
+            after_zwj = 0;
+        } else {
+            width += utf8CharWidth(cp);
+        }
+
+        /* A ZWJ here means the next codepoint will be joined. */
+        if (isZWJ(cp)) {
+            after_zwj = 1;
+        }
+
+        i += clen;
+    }
+    return width;
+}
+
+/* Return the display width of the 'len'-byte grapheme cluster at 's',
+ * measured exactly as utf8StrWidth() measures it, so that callers can
+ * subtract the result from a width that utf8StrWidth() computed. */
+static int utf8SingleCharWidth(const char *s, size_t len) {
+    if (len == 0) return 0;
+    return (int)utf8StrWidth(s, len);  /* Whole cluster, not just its first cp. */
+}
+
 enum KEY_ACTION{
 	KEY_NULL = 0,	    /* NULL */
 	CTRL_A = 1,         /* Ctrl+a */
@@ -220,6 +530,13 @@ static int isUnsupportedTerm(void) {
 static int enableRawMode(int fd) {
    struct termios raw;

+    /* Test mode: when LINENOISE_ASSUME_TTY is set, skip terminal setup.
+     * This allows testing via pipes without a real terminal. */
+    if (getenv("LINENOISE_ASSUME_TTY")) {
+        rawmode = 1;
+        return 0;
+    }
+
    if (!isatty(STDIN_FILENO)) goto fatal;
    if (!atexit_registered) {
        atexit(linenoiseAtExit);
@@ -253,6 +570,11 @@ fatal:
 }

 static void disableRawMode(int fd) {
+    /* Test mode: nothing to restore. */
+    if (getenv("LINENOISE_ASSUME_TTY")) {
+        rawmode = 0;
+        return;
+    }
    /* Don't even check the return value as it's too late. */
    if (rawmode && tcsetattr(fd,TCSAFLUSH,&orig_termios) != -1)
        rawmode = 0;
@@ -288,6 +610,10 @@ static int getCursorPosition(int ifd, int ofd) {
 static int getColumns(int ifd, int ofd) {
    struct winsize ws;

+    /* Test mode: LINENOISE_COLS fixes the width; values <= 0 are ignored. */
+    char *cols_env = getenv("LINENOISE_COLS");
+    if (cols_env && atoi(cols_env) > 0) return atoi(cols_env);
+
    if (ioctl(1, TIOCGWINSZ, &ws) == -1 || ws.ws_col == 0) {
        /* ioctl() failed. Try to query the terminal itself. */
        int start, cols;
@@ -505,16 +831,29 @@ static void abFree(struct abuf *ab) {
 }

 /* Helper of refreshSingleLine() and refreshMultiLine() to show hints
- * to the right of the prompt. */
-void refreshShowHints(struct abuf *ab, struct linenoiseState *l, int plen) {
+ * to the right of the prompt. Now uses display widths for proper UTF-8. */
+void refreshShowHints(struct abuf *ab, struct linenoiseState *l, int pwidth) {
    char seq[64];
-    if (hintsCallback && plen+l->len < l->cols) {
+    size_t bufwidth = utf8StrWidth(l->buf, l->len);
+    if (hintsCallback && pwidth + bufwidth < l->cols) {
        int color = -1, bold = 0;
        char *hint = hintsCallback(l->buf,&color,&bold);
        if (hint) {
-            int hintlen = strlen(hint);
-            int hintmaxlen = l->cols-(plen+l->len);
-            if (hintlen > hintmaxlen) hintlen = hintmaxlen;
+            size_t hintlen = strlen(hint);
+            size_t hintwidth = utf8StrWidth(hint, hintlen);
+            size_t hintmaxwidth = l->cols - (pwidth + bufwidth);
+            /* Truncate hint to fit, respecting UTF-8 boundaries. */
+            if (hintwidth > hintmaxwidth) {
+                size_t i = 0, w = 0;
+                while (i < hintlen) {
+                    size_t clen = utf8NextCharLen(hint, i, hintlen);
+                    int cwidth = utf8SingleCharWidth(hint + i, clen);
+                    if (w + cwidth > hintmaxwidth) break;
+                    w += cwidth;
+                    i += clen;
+                }
+                hintlen = i;
+            }
            if (bold == 1 && color == -1) color = 37;
            if (color != -1 || bold != 0)
                snprintf(seq,64,"\033[%d;%d;49m",bold,color);
@@ -536,23 +875,44 @@ void refreshShowHints(struct abuf *ab, struct linenoiseState *l, int plen) {
 * cursor position, and number of columns of the terminal.
 *
 * Flags is REFRESH_* macros. The function can just remove the old
- * prompt, just write it, or both. */
+ * prompt, just write it, or both.
+ *
+ * This function is UTF-8 aware and uses display widths (not byte counts)
+ * for cursor positioning and horizontal scrolling. */
 static void refreshSingleLine(struct linenoiseState *l, int flags) {
    char seq[64];
-    size_t plen = strlen(l->prompt);
+    size_t pwidth = utf8StrWidth(l->prompt, l->plen); /* Prompt display width */
    int fd = l->ofd;
    char *buf = l->buf;
-    size_t len = l->len;
-    size_t pos = l->pos;
+    size_t len = l->len;    /* Byte length of buffer to display */
+    size_t pos = l->pos;    /* Byte position of cursor */
+    size_t poscol;          /* Display column of cursor */
+    size_t lencol;          /* Display width of buffer */
    struct abuf ab;

-    while((plen+pos) >= l->cols) {
-        buf++;
-        len--;
-        pos--;
+    /* Calculate the display width up to cursor and total display width. */
+    poscol = utf8StrWidth(buf, pos);
+    lencol = utf8StrWidth(buf, len);
+
+    /* Scroll the buffer horizontally if cursor is past the right edge,
+     * trimming whole grapheme clusters from the left until it fits or
+     * nothing is left before the cursor (prompt wider than the terminal). */
+    while (pos > 0 && pwidth + poscol >= l->cols) {
+        size_t clen = utf8NextCharLen(buf, 0, len);
+        int cwidth = utf8SingleCharWidth(buf, clen);
+        buf += clen;
+        len -= clen;
+        pos -= clen;
+        poscol -= cwidth;
+        lencol -= cwidth;
+    }
-    while (plen+len > l->cols) {
-        len--;
+
+    /* Trim from the right while the line doesn't fit and text remains. */
+    while (len > 0 && pwidth + lencol > l->cols) {
+        size_t clen = utf8PrevCharLen(buf, len);
+        int cwidth = utf8SingleCharWidth(buf + len - clen, clen);
+        len -= clen;
+        lencol -= cwidth;
+    }

    abInit(&ab);
@@ -562,14 +922,19 @@ static void refreshSingleLine(struct linenoiseState *l, int flags) {

    if (flags & REFRESH_WRITE) {
        /* Write the prompt and the current buffer content */
-        abAppend(&ab,l->prompt,strlen(l->prompt));
+        abAppend(&ab,l->prompt,l->plen);
        if (maskmode == 1) {
-            while (len--) abAppend(&ab,"*",1);
+            /* In mask mode, we output one '*' per UTF-8 character, not byte */
+            size_t i = 0;
+            while (i < len) {
+                abAppend(&ab,"*",1);
+                i += utf8NextCharLen(buf, i, len);
+            }
        } else {
            abAppend(&ab,buf,len);
        }
-        /* Show hits if any. */
-        refreshShowHints(&ab,l,plen);
+        /* Show hints if any. */
+        refreshShowHints(&ab,l,pwidth);
    }

    /* Erase to right */
@@ -577,8 +942,8 @@ static void refreshSingleLine(struct linenoiseState *l, int flags) {
    abAppend(&ab,seq,strlen(seq));

    if (flags & REFRESH_WRITE) {
-        /* Move cursor to original position. */
-        snprintf(seq,sizeof(seq),"\r\x1b[%dC", (int)(pos+plen));
+        /* Move cursor to original position (using display column, not byte). */
+        snprintf(seq,sizeof(seq),"\r\x1b[%dC", (int)(poscol+pwidth));
        abAppend(&ab,seq,strlen(seq));
    }

@@ -592,14 +957,18 @@ static void refreshSingleLine(struct linenoiseState *l, int flags) {
 * cursor position, and number of columns of the terminal.
 *
 * Flags is REFRESH_* macros. The function can just remove the old
- * prompt, just write it, or both. */
+ * prompt, just write it, or both.
+ *
+ * This function is UTF-8 aware and uses display widths for positioning. */
 static void refreshMultiLine(struct linenoiseState *l, int flags) {
    char seq[64];
-    int plen = strlen(l->prompt);
-    int rows = (plen+l->len+l->cols-1)/l->cols; /* rows used by current buf. */
-    int rpos = (plen+l->oldpos+l->cols)/l->cols; /* cursor relative row. */
+    size_t pwidth = utf8StrWidth(l->prompt, l->plen);  /* Prompt display width */
+    size_t bufwidth = utf8StrWidth(l->buf, l->len);    /* Buffer display width */
+    size_t poswidth = utf8StrWidth(l->buf, l->pos);    /* Cursor display width */
+    int rows = (pwidth+bufwidth+l->cols-1)/l->cols;    /* rows used by current buf. */
+    int rpos = l->oldrpos;   /* cursor relative row from previous refresh. */
    int rpos2; /* rpos after refresh. */
-    int col; /* colum position, zero-based. */
+    int col; /* column position, zero-based. */
    int old_rows = l->oldrows;
    int fd = l->ofd, j;
    struct abuf ab;
@@ -634,22 +1003,26 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {

    if (flags & REFRESH_WRITE) {
        /* Write the prompt and the current buffer content */
-        abAppend(&ab,l->prompt,strlen(l->prompt));
+        abAppend(&ab,l->prompt,l->plen);
        if (maskmode == 1) {
-            unsigned int i;
-            for (i = 0; i < l->len; i++) abAppend(&ab,"*",1);
+            /* In mask mode, output one '*' per UTF-8 character, not byte */
+            size_t i = 0;
+            while (i < l->len) {
+                abAppend(&ab,"*",1);
+                i += utf8NextCharLen(l->buf, i, l->len);
+            }
        } else {
            abAppend(&ab,l->buf,l->len);
        }

-        /* Show hits if any. */
-        refreshShowHints(&ab,l,plen);
+        /* Show hints if any. */
+        refreshShowHints(&ab,l,pwidth);

        /* If we are at the very end of the screen with our prompt, we need to
         * emit a newline and move the prompt to the first column. */
        if (l->pos &&
            l->pos == l->len &&
-            (l->pos+plen) % l->cols == 0)
+            (poswidth+pwidth) % l->cols == 0)
        {
            lndebug("<newline>");
            abAppend(&ab,"\n",1);
@@ -660,10 +1033,10 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
        }

        /* Move cursor to right position. */
-        rpos2 = (plen+l->pos+l->cols)/l->cols; /* Current cursor relative row */
+        rpos2 = (pwidth+poswidth+l->cols)/l->cols; /* Current cursor relative row */
        lndebug("rpos2 %d", rpos2);

-        /* Go up till we reach the expected positon. */
+        /* Go up till we reach the expected position. */
        if (rows-rpos2 > 0) {
            lndebug("go-up %d", rows-rpos2);
            snprintf(seq,64,"\x1b[%dA", rows-rpos2);
@@ -671,7 +1044,7 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {
        }

        /* Set column. */
-        col = (plen+(int)l->pos) % (int)l->cols;
+        col = (pwidth+poswidth) % l->cols;
        lndebug("set col %d", 1+col);
        if (col)
            snprintf(seq,64,"\r\x1b[%dC", col);
@@ -682,6 +1055,7 @@ static void refreshMultiLine(struct linenoiseState *l, int flags) {

    lndebug("\n");
    l->oldpos = l->pos;
+    if (flags & REFRESH_WRITE) l->oldrpos = rpos2;

    if (write(fd,ab.b,ab.len) == -1) {} /* Can't recover from write error. */
    abFree(&ab);
@@ -718,29 +1092,37 @@ void linenoiseShow(struct linenoiseState *l) {
    }
 }

-/* Insert the character 'c' at cursor current position.
+/* Insert the character(s) 'c' of length 'clen' at cursor current position.
+ * This handles both single-byte ASCII and multi-byte UTF-8 sequences.
 *
 * On error writing to the terminal -1 is returned, otherwise 0. */
-int linenoiseEditInsert(struct linenoiseState *l, char c) {
-    if (l->len < l->buflen) {
+int linenoiseEditInsert(struct linenoiseState *l, const char *c, size_t clen) {
+    if (l->len + clen <= l->buflen) {
        if (l->len == l->pos) {
-            l->buf[l->pos] = c;
-            l->pos++;
-            l->len++;
+            /* Append at end of line. */
+            memcpy(l->buf+l->pos, c, clen);
+            l->pos += clen;
+            l->len += clen;
            l->buf[l->len] = '\0';
-            if ((!mlmode && l->plen+l->len < l->cols && !hintsCallback)) {
-                /* Avoid a full update of the line in the
-                 * trivial case. */
-                char d = (maskmode==1) ? '*' : c;
-                if (write(l->ofd,&d,1) == -1) return -1;
+            if ((!mlmode &&
+                 utf8StrWidth(l->prompt,l->plen)+utf8StrWidth(l->buf,l->len) < l->cols &&
+                 !hintsCallback)) {
+                /* Avoid a full update of the line in the trivial case:
+                 * single-width char, no hints, fits in one line. */
+                if (maskmode == 1) {
+                    if (write(l->ofd,"*",1) == -1) return -1;
+                } else {
+                    if (write(l->ofd,c,clen) == -1) return -1;
+                }
            } else {
                refreshLine(l);
            }
        } else {
-            memmove(l->buf+l->pos+1,l->buf+l->pos,l->len-l->pos);
-            l->buf[l->pos] = c;
-            l->len++;
-            l->pos++;
+            /* Insert in the middle of the line. */
+            memmove(l->buf+l->pos+clen, l->buf+l->pos, l->len-l->pos);
+            memcpy(l->buf+l->pos, c, clen);
+            l->len += clen;
+            l->pos += clen;
            l->buf[l->len] = '\0';
            refreshLine(l);
        }
@@ -748,18 +1130,18 @@ int linenoiseEditInsert(struct linenoiseState *l, char c) {
    return 0;
 }

-/* Move cursor on the left. */
+/* Move cursor on the left. Moves by one UTF-8 character, not byte. */
 void linenoiseEditMoveLeft(struct linenoiseState *l) {
    if (l->pos > 0) {
-        l->pos--;
+        l->pos -= utf8PrevCharLen(l->buf, l->pos);
        refreshLine(l);
    }
 }

-/* Move cursor on the right. */
+/* Move cursor on the right. Moves by one UTF-8 character, not byte. */
 void linenoiseEditMoveRight(struct linenoiseState *l) {
    if (l->pos != l->len) {
-        l->pos++;
+        l->pos += utf8NextCharLen(l->buf, l->pos, l->len);
        refreshLine(l);
    }
 }
@@ -807,39 +1189,44 @@ void linenoiseEditHistoryNext(struct linenoiseState *l, int dir) {
 }

 /* Delete the character at the right of the cursor without altering the cursor
- * position. Basically this is what happens with the "Delete" keyboard key. */
+ * position. Basically this is what happens with the "Delete" keyboard key.
+ * Now handles multi-byte UTF-8 characters. */
 void linenoiseEditDelete(struct linenoiseState *l) {
    if (l->len > 0 && l->pos < l->len) {
-        memmove(l->buf+l->pos,l->buf+l->pos+1,l->len-l->pos-1);
-        l->len--;
+        size_t clen = utf8NextCharLen(l->buf, l->pos, l->len);
+        memmove(l->buf+l->pos, l->buf+l->pos+clen, l->len-l->pos-clen);
+        l->len -= clen;
        l->buf[l->len] = '\0';
        refreshLine(l);
    }
 }

-/* Backspace implementation. */
+/* Backspace implementation. Deletes the UTF-8 character before the cursor. */
 void linenoiseEditBackspace(struct linenoiseState *l) {
    if (l->pos > 0 && l->len > 0) {
-        memmove(l->buf+l->pos-1,l->buf+l->pos,l->len-l->pos);
-        l->pos--;
-        l->len--;
+        size_t clen = utf8PrevCharLen(l->buf, l->pos);
+        memmove(l->buf+l->pos-clen, l->buf+l->pos, l->len-l->pos);
+        l->pos -= clen;
+        l->len -= clen;
        l->buf[l->len] = '\0';
        refreshLine(l);
    }
 }

-/* Delete the previosu word, maintaining the cursor at the start of the
- * current word. */
+/* Delete the previous word, maintaining the cursor at the start of the
+ * current word. Handles UTF-8 by moving character-by-character. */
 void linenoiseEditDeletePrevWord(struct linenoiseState *l) {
    size_t old_pos = l->pos;
    size_t diff;

+    /* Skip spaces before the word (move backwards by UTF-8 chars). */
    while (l->pos > 0 && l->buf[l->pos-1] == ' ')
-        l->pos--;
+        l->pos -= utf8PrevCharLen(l->buf, l->pos);
+    /* Skip non-space characters (move backwards by UTF-8 chars). */
    while (l->pos > 0 && l->buf[l->pos-1] != ' ')
-        l->pos--;
+        l->pos -= utf8PrevCharLen(l->buf, l->pos);
    diff = old_pos - l->pos;
-    memmove(l->buf+l->pos,l->buf+old_pos,l->len-old_pos+1);
+    memmove(l->buf+l->pos, l->buf+old_pos, l->len-old_pos+1);
    l->len -= diff;
    refreshLine(l);
 }
@@ -886,6 +1273,7 @@ int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, ch

    l->cols = getColumns(stdin_fd, stdout_fd);
    l->oldrows = 0;
+    l->oldrpos = 1;  /* Cursor starts on row 1. */
    l->history_index = 0;

    /* Buffer starts empty. */
@@ -895,7 +1283,7 @@ int linenoiseEditStart(struct linenoiseState *l, int stdin_fd, int stdout_fd, ch
    /* If stdin is not a tty, stop here with the initialization. We
     * will actually just read a line from standard input in blocking
     * mode later, in linenoiseEditFeed(). */
-    if (!isatty(l->ifd)) return 0;
+    if (!isatty(l->ifd) && !getenv("LINENOISE_ASSUME_TTY")) return 0;

    /* The latest history entry is always our current buffer, that
     * initially is just an empty string. */
@@ -928,7 +1316,7 @@ char *linenoiseEditMore = "If you see this, you are misusing the API: when linen
 char *linenoiseEditFeed(struct linenoiseState *l) {
    /* Not a TTY, pass control to line reading without character
     * count limits. */
-    if (!isatty(l->ifd)) return linenoiseNoTTY();
+    if (!isatty(l->ifd) && !getenv("LINENOISE_ASSUME_TTY")) return linenoiseNoTTY();

    char c;
    int nread;
@@ -985,11 +1373,17 @@ char *linenoiseEditFeed(struct linenoiseState *l) {
        }
        break;
    case CTRL_T:    /* ctrl-t, swaps current character with previous. */
+        /* Handle UTF-8: swap the two UTF-8 characters around cursor. */
        if (l->pos > 0 && l->pos < l->len) {
-            int aux = l->buf[l->pos-1];
-            l->buf[l->pos-1] = l->buf[l->pos];
-            l->buf[l->pos] = aux;
-            if (l->pos != l->len-1) l->pos++;
+            char tmp[32];  /* Swap buffer; longer clusters are left unswapped. */
+            size_t prevlen = utf8PrevCharLen(l->buf, l->pos);
+            size_t currlen = utf8NextCharLen(l->buf, l->pos, l->len);
+            size_t prevstart = l->pos - prevlen;
+            if (currlen > sizeof(tmp) || currlen > l->len - l->pos) break;
+            memcpy(tmp, l->buf + l->pos, currlen);
+            memmove(l->buf + prevstart + currlen, l->buf + prevstart, prevlen);
+            memcpy(l->buf + prevstart, tmp, currlen);
+            l->pos += currlen;
            refreshLine(l);
        }
        break;
@@ -1061,7 +1455,22 @@ char *linenoiseEditFeed(struct linenoiseState *l) {
        }
        break;
    default:
-        if (linenoiseEditInsert(l,c)) return NULL;
+        /* Handle UTF-8 multi-byte sequences. When we receive the first byte
+         * of a multi-byte UTF-8 character, read the remaining bytes to
+         * complete the sequence before inserting. */
+        {
+            char utf8[4];
+            int utf8len = utf8ByteLen(c);
+            int got = 1;  /* Bytes of the sequence actually stored in utf8[]. */
+            utf8[0] = c;
+            /* Read the remaining bytes; stop early if the input dries up. */
+            while (got < utf8len) {
+                if (read(l->ifd, utf8+got, 1) != 1) break;
+                got++;
+            }
+            /* Insert only what was read, never uninitialized bytes. */
+            if (linenoiseEditInsert(l, utf8, got)) return NULL;
+        }
        break;
    case CTRL_U: /* Ctrl+u, delete the whole line. */
        l->buf[0] = '\0';
@@ -1095,7 +1504,7 @@ char *linenoiseEditFeed(struct linenoiseState *l) {
 * returns something different than NULL. At this point the user input
 * is in the buffer, and we can restore the terminal in normal mode. */
 void linenoiseEditStop(struct linenoiseState *l) {
-    if (!isatty(l->ifd)) return;
+    if (!isatty(l->ifd) && !getenv("LINENOISE_ASSUME_TTY")) return;
    disableRawMode(l->ifd);
    printf("\n");
 }
@@ -1193,7 +1602,7 @@ static char *linenoiseNoTTY(void) {
 char *linenoise(const char *prompt) {
    char buf[LINENOISE_MAX_LINE];

-    if (!isatty(STDIN_FILENO)) {
+    if (!isatty(STDIN_FILENO) && !getenv("LINENOISE_ASSUME_TTY")) {
        /* Not a tty: read from file / pipe. In this mode we don't want any
         * limit to the line size, so we call a function to handle that. */
        return linenoiseNoTTY();
--- a/linenoise.h
+++ b/linenoise.h
@@ -65,6 +65,7 @@ struct linenoiseState {
    size_t len;         /* Current edited line length. */
    size_t cols;        /* Number of columns in terminal. */
    size_t oldrows;     /* Rows used by last refrehsed line (multiline mode) */
+    int oldrpos;        /* Cursor row from last refresh (for multiline clearing). */
    int history_index;  /* The history index we are currently editing. */
 };