profile picture

More about UTF-8

August 02, 2008 - programming unicode c utf-8

Now suppose you want to rid a string of malformed UTF-8 characters.

Here is an example that checks a UTF-8 string and replaces all malformed characters with '?'. It uses the same idea as in the previous post. Note that it checks only the encoding: it is possible for the encoding to be correct while the symbol itself doesn't exist.

#include <unistd.h>
#include <string.h>
#include <stdio.h>
/*
 * Nonzero if the byte n positions after the pointer `c` is a UTF-8
 * continuation byte (bit pattern 10xxxxxx). The macro relies on a
 * pointer named `c` being in scope at the point of use.
 */
#define CHECK(n) ((*(c + (n)) & 0xC0) == 0x80)

void utf8_fix_string(char *buf);

/*
 * Return how many bytes at the end of buf[0..len) are the beginning of a
 * multi-byte UTF-8 sequence that is cut off by the end of the buffer,
 * or 0 if the buffer does not end in the middle of a sequence.
 */
static size_t
utf8_cut_tail(const char *buf, size_t len)
{
  size_t back;

  /* A sequence is at most 4 bytes, so a cut-off prefix is at most 3. */
  for (back = 1; back <= 3 && back <= len; back++) {
    unsigned char b = (unsigned char)buf[len - back];

    if ((b & 0xC0) == 0x80) {
      continue;                 /* continuation byte: keep looking for lead */
    }
    if (b >= 0xC0) {
      size_t need = (b >= 0xF0) ? 4 : (b >= 0xE0) ? 3 : 2;

      return (need > back) ? back : 0;
    }
    return 0;                   /* ASCII byte: nothing is cut off */
  }
  return 0;
}

/*
 * Copy stdin to stdout, replacing malformed UTF-8 bytes with '?'.
 *
 * Input is read in chunks with fgets(). When a chunk fills the buffer
 * without reaching a newline, a multi-byte character may straddle the chunk
 * boundary; its leading bytes are held back and prepended to the next chunk
 * so that a valid character is not mistaken for a malformed one.
 */
int
main(void)
{
  char   buf[4096];
  char   tail[3];
  size_t held = 0;

  while (fgets(buf + held, (int)(sizeof buf - held), stdin)) {
    size_t len = held + strlen(buf + held);

    held = 0;
    if (len == sizeof buf - 1 && buf[len - 1] != '\n') {
      held = utf8_cut_tail(buf, len);
    }
    memcpy(tail, buf + len - held, held);
    buf[len - held] = '\0';

    utf8_fix_string(buf);
    fputs(buf, stdout);

    /* Move the held-back bytes to the front for the next read. */
    memcpy(buf, tail, held);
    buf[held] = '\0';
  }

  /* Input ended right after a cut-off sequence: flush what was held back. */
  if (held > 0) {
    memcpy(buf, tail, held);
    buf[held] = '\0';
    utf8_fix_string(buf);
    fputs(buf, stdout);
  }
  return 0;
}

/*
 * Return nonzero if b is a UTF-8 continuation byte (10xxxxxx).
 */
static int
utf8_is_cont(unsigned char b)
{
  return (b & 0xC0) == 0x80;
}

/*
 * Return the length (1..4) of the well-formed UTF-8 sequence that starts
 * at s, or 0 if the bytes at s do not start one. At most avail bytes are
 * examined.
 *
 * Besides the lead/continuation bit patterns this also rejects overlong
 * forms (lead bytes C0/C1, E0 80..9F, F0 80..8F), UTF-16 surrogates
 * (ED A0..BF) and code points above U+10FFFF (F4 90.. and lead bytes F5+).
 */
static size_t
utf8_seq_len(const unsigned char *s, size_t avail)
{
  unsigned char lead = s[0];

  if (lead < 0x80) {
    return 1;
  }
  if (lead >= 0xC2 && lead <= 0xDF) {
    return (avail >= 2 && utf8_is_cont(s[1])) ? 2 : 0;
  }
  if ((lead & 0xF0) == 0xE0) {
    if (avail < 3 || !utf8_is_cont(s[1]) || !utf8_is_cont(s[2])) {
      return 0;
    }
    if (lead == 0xE0 && s[1] < 0xA0) {
      return 0;                 /* overlong 3-byte form */
    }
    if (lead == 0xED && s[1] > 0x9F) {
      return 0;                 /* surrogate U+D800..U+DFFF */
    }
    return 3;
  }
  if (lead >= 0xF0 && lead <= 0xF4) {
    if (avail < 4 || !utf8_is_cont(s[1]) || !utf8_is_cont(s[2])
            || !utf8_is_cont(s[3])) {
      return 0;
    }
    if (lead == 0xF0 && s[1] < 0x90) {
      return 0;                 /* overlong 4-byte form */
    }
    if (lead == 0xF4 && s[1] > 0x8F) {
      return 0;                 /* above U+10FFFF */
    }
    return 4;
  }
  /* Stray continuation byte, C0/C1, or F5..FF. */
  return 0;
}

/*
 * Replace all invalid utf8 characters in buf by '?'
 *
 * buf is a NUL-terminated string that is modified in place; its length
 * never changes. Each byte that does not begin a well-formed sequence is
 * overwritten with a single '?', and scanning resumes at the next byte.
 * A NULL buf is ignored.
 */
void
utf8_fix_string(char *buf)
{
  unsigned char *p;
  size_t        left;

  if (buf == NULL) {
    return;
  }
  p = (unsigned char *)buf;
  left = strlen(buf);

  while (left > 0) {
    size_t n = utf8_seq_len(p, left);

    if (n == 0) {
      *p = '?';
      n = 1;
    }
    p += n;
    left -= n;
  }
}

Alternatively, you can use the wchar functions. They are more convenient if you want to actively process text with wide characters, but at the same time they are not very flexible if you need to handle encoding errors in the input data. The problem is that the position in the stream after an fgetwc() failure isn't determined by the standard, so the following example may work incorrectly on some systems. It was tested on Ubuntu 8.04 amd64.

#include <stdio.h>
#include <wchar.h>
#include <err.h>
#include <errno.h>
#include <locale.h>

/*
 * Copy stdin to stdout as wide characters, writing '?' in place of every
 * byte that fgetwc() rejects as an invalid multibyte sequence.
 *
 * Exits with status 1 if the locale cannot be set or on an I/O error.
 */
int
main(void)
{
  wint_t wc;

/*
 * Input is encoded in utf-8
 */
  if (setlocale(LC_CTYPE, "en_US.UTF-8") == NULL) {
    errx(1, "cannot set locale en_US.UTF-8");
  }
  for (;;) {
  /*
   * errno is only meaningful if it was clear before the call; other
   * library calls (including output to stdout) may have left it set.
   */
    errno = 0;
    wc = fgetwc(stdin);
  /*
   * it may be EOF or broken character
   */
    if (wc == WEOF) {
      if (errno == EILSEQ) {
        wc = L'?';
        clearerr(stdin);
      /*
       * Shift broken byte from the stream.
       * It is needed if you use GNU libc, but
       * may not work on other systems.
       */
        fgetc(stdin);
      } else if (feof(stdin)) {
        break;
      } else {
        err(1, "error on reading stdin");
      }
    }
    if (fputwc(wc, stdout) == WEOF) {
      err(1, "error on writing stdout");
    }
  }
  return 0;
}