/* Bruger:Byrial/programmer/simple title.c */

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <wctype.h>
#include <wchar.h>
#include "simple_title.h"
#include <sys/types.h>
#include <regex.h>
#include <stdlib.h>

/*
 * Return the compiled regular expression used to recognise a Roman
 * numeral (1 to 13, all lower case or all upper case) at the start of a
 * string, followed by '_', '(' or the end of the string.
 *
 * The expression is compiled on the first call only; later calls return
 * the same object. If compilation fails a message is printed and the
 * program exits with status 1.
 */
regex_t *get_preg ()
{
  static regex_t compiled;
  static int is_ready = 0;

  // Fast path: already compiled by an earlier call
  if (is_ready)
    return &compiled;

  int status = regcomp (&compiled,
			"^(i|ii|iii|iiii|iv|v|vi|vii|viii|ix|x|xi|xii|xiii|"
			"I|II|III|IIII|IV|V|VI|VII|VIII|IX|X|XI|XII|XIII)(_|\\(|$)",
			REG_EXTENDED);
  if (status != 0)
  {
    char message[128];
    regerror (status, &compiled, message, sizeof message);
    printf ("regcomp() failed with code %d: %s\n", status, message);
    exit (1);
  }

  is_ready = 1;
  return &compiled;
}

/*
 * Convert a Roman numeral at the start of s to its integer value.
 *
 * Only the letters i, v and x (either case) are understood, so the result
 * is meaningful for the values 1 to 13. Parsing stops at the first
 * character that does not fit the pattern; a string that does not start
 * with a numeral gives 0.
 */
int convert_roman (const char *s)
{
  const char *p = s;
  int value = 0;

  // Leading ones: either the whole number (i, ii, iii, iiii) or the
  // part that is subtracted from a following v or x
  for (; *p == 'i' || *p == 'I'; p ++)
    value ++;

  switch (*p)
  {
  case 'v':
  case 'V':
    value = 5 - value;
    p ++;
    break;

  case 'x':
  case 'X':
    value = 10 - value;
    p ++;
    break;

  default:
    break;
  }

  // Trailing ones are added
  for (; *p == 'i' || *p == 'I'; p ++)
    value ++;

  return value;
}

/*
 * Append the multibyte encoding of wch (in the current locale) at to and
 * return the new end of the output. A character that cannot be encoded is
 * dropped, so the returned pointer never moves backwards.
 */
static char *append_wchar (char *to, wchar_t wch)
{
  mbstate_t state;
  memset (&state, 0, sizeof state);

  size_t len = wcrtomb (to, wch, &state);
  if (len == (size_t) -1)
    return to;
  return to + len;
}

/*
 * Tell whether wch belongs to one of the scripts that are known but get
 * no special treatment; such characters are copied to the simple title
 * without a diagnostic.
 * See http://en.wikipedia.org/wiki/Mapping_of_Unicode_character_planes
 */
static bool is_passthrough_script (wchar_t wch)
{
  return (wch >= 0x0370 && wch <= 0x03FF) // Greek and Coptic
      || (wch >= 0x0400 && wch <= 0x04FF) // Cyrillic
      || (wch >= 0x0530 && wch <= 0x058F) // Armenian
      || (wch >= 0x0590 && wch <= 0x05FF) // Hebrew
      || (wch >= 0x0600 && wch <= 0x06FF) // Arabic
      || (wch >= 0x0700 && wch <= 0x074F) // Syriac
      || (wch >= 0x0750 && wch <= 0x077F) // Arabic Supplement
      || (wch >= 0x0780 && wch <= 0x07BF) // Thaana
      || (wch >= 0x0900 && wch <= 0x097F) // Devanagari
      || (wch >= 0x0980 && wch <= 0x09FF) // Bengali
      || (wch >= 0x0A00 && wch <= 0x0A7F) // Gurmukhi
      || (wch >= 0x0A80 && wch <= 0x0AFF) // Gujarati
      || (wch >= 0x0B00 && wch <= 0x0B7F) // Oriya
      || (wch >= 0x0B80 && wch <= 0x0BFF) // Tamil
      || (wch >= 0x0C00 && wch <= 0x0C7F) // Telugu
      || (wch >= 0x0C80 && wch <= 0x0CFF) // Kannada
      || (wch >= 0x0D00 && wch <= 0x0D7F) // Malayalam
      || (wch >= 0x0D80 && wch <= 0x0DFF) // Sinhala
      || (wch >= 0x0E00 && wch <= 0x0EFF) // Thai
      || (wch >= 0x0F00 && wch <= 0x0FFF) // Tibetan
      || (wch >= 0x10A0 && wch <= 0x10FF) // Georgian
      || (wch >= 0x1200 && wch <= 0x137F) // Ethiopic
      || (wch >= 0x1380 && wch <= 0x139F) // Ethiopic Supplement
      || (wch >= 0x13A0 && wch <= 0x13FF) // Cherokee
      || (wch >= 0x1400 && wch <= 0x167F) // Unified Canadian Aboriginal Syllabics
      || (wch >= 0x16A0 && wch <= 0x16FF) // Runic
      || (wch >= 0x1780 && wch <= 0x17FF) // Khmer
      || (wch >= 0x1800 && wch <= 0x18AF) // Mongolian
      || (wch >= 0x1F00 && wch <= 0x1FFF) // Greek Extended
      || (wch >= 0x3040 && wch <= 0x309F) // Hiragana
      || (wch >= 0x3000 && wch <= 0x303F) // CJK Symbols and Punctuation
      || (wch >= 0x30A0 && wch <= 0x30FF) // Katakana
      || (wch >= 0x3400 && wch <= 0x4DBF) // CJK Unified Ideographs Extension A
      || (wch >= 0x4E00 && wch <= 0x9FFF) // CJK Unified Ideographs
      || (wch >= 0xAC00 && wch <= 0xD7AF) // Hangul
      || (wch >= 0xFB50 && wch <= 0xFDFF) // Arabic Presentation Forms-A
      || (wch >= 0xFE70 && wch <= 0xFEFF) // Arabic Presentation Forms-B
      || (wch >= 0x10330 && wch <= 0x1034F) // Gothic
      || (wch >= 0x20000 && wch <= 0x2A6DF); // CJK Unified Ideographs Extension B
}

/*
 * Make a simplified title
 *
 * The simplified title is built from title as follows:
 *  - a leading "Den_", "Det_", "The_", "Dei_", "Sir_" or "De_" is skipped;
 *  - ASCII letters and digits are kept (letters in lower case), all other
 *    ASCII characters are dropped;
 *  - a Roman numeral (1 to 13) directly after '_' or '(' and followed by
 *    '_', '(' or the end of the title is replaced by its decimal value;
 *  - non-ASCII letters are lower-cased and mapped to ASCII look-alikes
 *    where a mapping is listed below, otherwise copied as they are;
 *  - non-ASCII characters that are not alphanumeric are dropped, except
 *    the superscripts 1, 2 and 3 which become plain digits.
 *
 * Parameters:
 *  title     - the title to simplify, as a multibyte string. Decoding uses
 *              mbrtowc(), so it follows the current locale; the comments
 *              here assume the caller has selected a UTF-8 locale.
 *  id        - only used to identify the title in diagnostic messages.
 *  utf8error - set to true when an invalid multibyte sequence is found;
 *              never reset by this function. May be NULL.
 *  lang      - language code; for "is" the Icelandic letters are kept
 *              instead of being transcribed. NULL means no language.
 *
 * Returns a pointer to a static buffer which is overwritten by the next
 * call. Output that does not fit in the buffer is truncated at a
 * character boundary.
 */
const char *make_simple_title (const char *title, int id,
			       bool *utf8error, const char *lang)
{
  // Larger than the 255 bytes a simple title needed so far, so that every
  // title that used to fit gives the same result even with the headroom
  // reserved per loop iteration below.
  static char simple_title[512];
  char *const limit = simple_title + sizeof simple_title - 1; // keep room for '\0'

  // The most bytes one loop iteration can write: one multibyte character
  // or a two byte replacement such as "ae" or a two digit number.
  const size_t step_max = (size_t) MB_CUR_MAX > 2 ? (size_t) MB_CUR_MAX : 2;

  const bool icelandic = (lang != NULL && strcmp (lang, "is") == 0);

  // Private decoding state, so an invalid sequence cannot leave a hidden
  // state behind for later characters or later calls.
  mbstate_t decode_state;
  memset (&decode_state, 0, sizeof decode_state);

  const char *from = title;

  // Pass articles etc.
  if (strncmp (title, "Den_", 4) == 0 || // Danish article
      strncmp (title, "Det_", 4) == 0 || // Danish article
      strncmp (title, "The_", 4) == 0 || // English article
      strncmp (title, "Dei_", 4) == 0 || // Nynorsk article
      strncmp (title, "Sir_", 4) == 0)   // English title
  {
    from += 4;
  }
  else if (strncmp (title, "De_", 3) == 0) // Danish article
  {
    from += 3;
  }

  char *to = simple_title;
  while (*from)
  {
    // Stop before the output buffer can overflow
    if ((size_t) (limit - to) < step_max)
      break;

    unsigned char ch = *from;

    if (ch < 128)
    {
      // ASCII character

      if (ch == '_' || ch == '(')
      {
        // Check for some Roman numerals
        regex_t *preg = get_preg ();
        regmatch_t pmatch[2];
        int rc = regexec (preg, from + 1, 2, pmatch, 0);
        if (rc == 0)
        {
          // match - write the value as one or two decimal digits
          int i = convert_roman (from + 1);
          if (i > 9)
          {
            *to = '0' + (i / 10);
            to ++;
          }
          *to = '0' + (i % 10);
          to ++;
          // The match is relative to from + 1, so together with the
          // increment below this also passes the delimiter that ended
          // the numeral.
          from += pmatch[0].rm_eo;
        }
        else if (rc == REG_NOMATCH)
        {
          // No match - drop the character
        }
        else
        {
          char errbuf[128];
          regerror (rc, preg, errbuf, sizeof errbuf);
          printf ("regexec() failed with code %d: %s\n", rc, errbuf);
          exit (1);
        }
      }

      if (isalnum (ch))
      {
        *to = tolower (ch);
        ++ to;
      }
      ++ from;
    }
    else
    {
      // multibyte UTF-8 character
      wchar_t wch;
      size_t wch_len = mbrtowc (&wch, from, 6, &decode_state);
      if (wch_len == (size_t) -1)
      {
        if (utf8error != NULL)
          *utf8error = true;
        // printf ("make_simple_title: "
        //	"Invalid UTF-8 character in '%s', id = %d\n",
        //	title, id);
        // Skip this byte and start decoding afresh
        memset (&decode_state, 0, sizeof decode_state);
        ++ from;
        continue;
      }
      if (wch_len == (size_t) -2)
      {
        // This should never happen
        printf ("make_simple_title: "
                "Too long multibyte char in '%s', id = %d\n",
                title, id);
        // Skip this byte and start decoding afresh
        memset (&decode_state, 0, sizeof decode_state);
        ++ from;
        continue;
      }
      from += wch_len;

      if (! iswalnum (wch))
      {
        if (wch == L'¹')
          *to ++ = '1';
        else if (wch == L'²')
          *to ++ = '2';
        else if (wch == L'³')
          *to ++ = '3';
        continue;
      }

      wch = towlower (wch);

      if ((wch >= 0xFF10 && wch <= 0xFF19)	// Fullwidth digits 0-9
          || (wch >= 0xFF41 && wch <= 0xFF5A))	// Fullwidth letters a-z
      {
        // The fullwidth forms are laid out like ASCII, so one offset
        // serves both digits and letters.
        *to ++ = (char) (wch - 0xFF10 + '0');
        continue;
      }
      else if (wch >= 0x1D538 && wch <= 0x1D551) // Mathematical double-struck capital A-Z
      {
        *to ++ = (char) (wch - 0x1D538 + 'a');
        continue;
      }
      else if (wch >= 0x1D552 && wch <= 0x1D56B) // Mathematical double-struck small a-z
      {
        *to ++ = (char) (wch - 0x1D552 + 'a');
        continue;
      }
      else if (wch >= 0x2170 && wch <= 0x2178) // Small Roman Numeral 1-9
      {
        *to ++ = (char) (wch - 0x2170 + '1'); // Is conversion to letters better?
        continue;
      }

      if (icelandic)
      {
        if (wch == L'á' ||
            wch == L'ð' || // Transcribed d in Danish
            wch == L'é' ||
            wch == L'í' ||
            wch == L'ó' ||
            wch == L'ú' ||
            wch == L'ý' ||
            wch == L'þ') // Transcribed th in Danish
          // These are all normal Icelandic letters - use them as is
        {
          to = append_wchar (to, wch);
          continue;
        }
      }

      switch (wch)
      {
      case L'ȝ':	// U+21D Letter yogh in Middle English
        *to ++ = '3';	// number 3
        break;

      case L'ä': // German, Swedish
      case L'æ': // Icelandic, Danish, Norwegian
      case L'ǽ':
      case L'ǣ':
      case L'œ': // oe ligature (French, Latin),
        // but, alas, also seen used instead of æ
        *to ++ = 'a';
        *to ++ = 'e';
        break;

      case L'ö': // Icelandic, Swedish, German
      case L'ø': // Danish, Norwegian
      case L'ǿ':
        *to ++ = 'o';
        *to ++ = 'e';
        break;

      case L'å': // Danish, Norwegian
      case L'ǻ':
        *to ++ = 'a';
        *to ++ = 'a';
        break;

        // Just ignore all other diacritics
      case L'á':
      case L'à':
      case L'â':
      case L'ã':
      case L'ă':
      case L'ā':
      case L'ą':
      case L'ạ':
      case L'ª':
      case L'ǎ':
      case L'ả':
      case L'ấ':
      case L'ẫ':
      case L'ậ':
      case L'ắ':
      case L'ẩ':
      case L'ầ':
      case L'ằ':
      case L'ặ':
      case L'ɐ': // U+0250 Near-open central vowel
        *to ++ = 'a';
        break;

      case L'ḃ':
      case L'ƀ':
      case L'ƅ':
      case L'ɓ': // U+0253 (IPA voiced bilabial implosive, used in African languages)
      case L'ḇ':
        *to ++ = 'b';
        break;

      case L'ç':
      case L'ć':
      case L'č':
      case L'ĉ':
      case L'ℂ':
      case L'ċ':
        *to ++ = 'c';
        break;

      case L'đ':
      case L'ď':
      case L'ḍ':
      case L'ḋ':
      case L'ð': // Transcribed d in Danish
      case L'ɖ':
      case L'ɗ': // U+0257 (IPA voiced dental or alveolar implosive, used in African languages)
      case L'ḏ':
        *to ++ = 'd';
        break;

      case L'dž': 
      case L'dz': 
        *to ++ = 'd';
        *to ++ = 'z';
        break;

      case L'é':
      case L'è':
      case L'ê':
      case L'ë':
      case L'ė':
      case L'ē':
      case L'ě':
      case L'ę':
      case L'ə':
      case L'ĕ':
      case L'ễ':
      case L'ế':
      case L'ệ':
      case L'ℓ':
      case L'ẹ':
      case L'ề':
      case L'ể':
      case L'ẽ':
      case L'ḗ':
      case L'ǝ': // U+01DD
        *to ++ = 'e';
        break;

      case L'ḟ':
      case L'ƒ':
        *to ++ = 'f';
        break;

      case L'fi': // U+FB01 - fi ligature
        *to ++ = 'f';
        *to ++ = 'i';
        break;

      case L'fl': // U+FB02 - fl ligature
        *to ++ = 'f';
        *to ++ = 'l';
        break;

      case L'ğ':
      case L'ĝ':
      case L'ģ':
      case L'ġ':
      case L'ǧ':
      case L'ḡ':
      case L'ǵ':	// U+01F5
      case L'ǥ':	// U+01E5
        *to ++ = 'g';
        break;

      case L'ĥ':
      case L'ћ':
      case L'ḥ':
      case L'ħ':
      case L'ḫ':	// U+1E2B
      case L'ẖ':	// U+1E96
        *to ++ = 'h';
        break;

      case L'í':
      case L'ì':
      case L'î':
      case L'ĩ':
      case L'ï':
      case L'ı':
      case L'ī':
      case L'ї':
      case L'ĭ':
      case L'i':	// Normal i - lowercase of İ
      case L'ǐ':
      case L'ị':
      case L'ɨ':
      case L'į':
      case L'ỉ':
      case L'ɪ':	// U+26A IPA Near-close near-front unrounded vowel
        *to ++ = 'i';
        break;

      case L'ij':
        *to ++ = 'i';
        *to ++ = 'j';
        break;

      case L'ĵ':
      case L'ʝ':	// U+029D IPA Voiced palatal fricative
        *to ++ = 'j';
        break;

      case L'ĸ':
        *to ++ = 'k';
        break;

      case L'ќ':
      case L'ķ':
      case L'ḵ':
      case L'ḱ':	// U+1E31
      case L'ⱪ':	// U+2C6A
        *to ++ = 'k';
        break;

      case L'ł':
      case L'ļ':
      case L'ľ':
      case L'ḷ':
      case L'ǃ':
      case L'ĺ':
      case L'ɬ':
        *to ++ = 'l';
        break;

      case L'ṁ':
      case L'ṃ':
        *to ++ = 'm';
        break;

      case L'ñ':
      case L'ń':
      case L'ň':
      case L'ņ':
      case L'ṇ':
      case L'ŋ':
      case L'ℕ':
      case L'ṉ':
      case L'n':
      case L'ṅ':	// U+1E45
        *to ++ = 'n';
        break;

      case L'ó':
      case L'ò':
      case L'ô':
      case L'õ':
      case L'ō':
      case L'ő':
      case L'ọ':
      case L'ǫ':
      case L'º':
      case L'ơ':
      case L'ồ':
      case L'ố':
      case L'ờ':
      case L'ổ':
      case L'ớ':
      case L'ỗ':
      case L'ŏ':
      case L'ǒ':
      case L'ộ':
      case L'ợ':
        //      case L'o':
      case L'ỏ':
      case L'ở': // U+1EDF
        *to ++ = 'o';
        break;

      case L'ṗ':
        //      case L'p':
        *to ++ = 'p';
        break;

      case L'ℚ':
        *to ++ = 'q';
        break;

      case L'ř':
      case L'ℝ':
      case L'ŕ':
      case L'ṛ':
      case L'ȑ':
      case L'ŗ':
        *to ++ = 'r';
        break;

      case L'š':
      case L'ş':
      case L'ś':
      case L'ſ':
      case L'ŝ':
      case L'ș':
      case L'ṣ':
      case L'ṡ':
      case L'ʂ':
        *to ++ = 's';
        break;

      case L'ß':
        *to ++ = 's';
        *to ++ = 's';
        break;

      case L'ţ':
      case L'ť':
      case L'ṭ':
      case L'ț':
      case L'ṫ':
      case L'ŧ':
      case L'ṯ': // U+1E6F
        *to ++ = 't';
        break;

      case L'þ': // Transcribed th in Danish
        *to ++ = 't';
        *to ++ = 'h';
        break;

      case L'ú':
      case L'ù':
      case L'û':
      case L'ũ':
      case L'ū':
      case L'ů':
      case L'ư':
      case L'ǔ':
      case L'ų':
      case L'ŭ':
      case L'џ':
      case L'ṳ':
      case L'ű':
      case L'ứ':
      case L'ữ':
      case L'ự':
      case L'ừ':
      case L'µ':
      case L'ủ':
      case L'ǖ':
      case L'ǘ':
      case L'ǚ':
      case L'ǜ':
      case L'ụ':
      case L'ử':
        *to ++ = 'u';
        break;

      case L'ʋ':	//  U+28B Based on italic V, used in IPA and some African languages
        *to ++ = 'v';
        break;

      case L'ŵ':
      case L'ẁ':
      case L'ẃ':
      case L'ẅ':
        *to ++ = 'w';
        break;

      case L'ý':
      case L'ü':
      case L'ŷ':
      case L'ÿ':
      case L'ỳ':
      case L'ỹ':
      case L'ў':
      case L'ẏ':
      case L'y':
        *to ++ = 'y';
        break;

      case L'ż':
      case L'ž':
      case L'ź':
      case L'ℤ':
      case L'Ẓ':
      case L'ẓ':
      case L'ẑ':
      case L'ʒ':	// U+0292 - ezh or tailed z see http://en.wikipedia.org/wiki/Ezh_%28letter%29
			// IPA for Voiced postalveolar fricative
        *to ++ = 'z';
        break;

      case L'ǀ': // IPA dental click (0x01C0)
      case L'ǂ': // IPA palatal click (0x01C2)
        *to ++ = '|'; // Vertical bar
        break;

      case L'ǁ': // IPA lateral click
        *to ++ = '|'; // Vertical bar
        *to ++ = '|'; // Vertical bar
        break;

      case L'ʔ': // Glottal stop (0x0294)
        *to ++ = '?'; // Question mark
        break;

      case L'ʾ': // U+02BE - Modifier letter right half ring -
	         // Used in transliterations - drop it.
      case L'ʿ': // 0x02BF - Modifier letter left half ring -
	         // Used in Latin transliterations of Hebrew, Arabic and other. Drop it.
      case L'ʼ': // (0x02BC - "modifier letter apostrophe") - Drop it.
      case L'ʻ': // "U+02BB MODIFIER LETTER TURNED COMMA" - Drop it.
      case L'ˁ': // U+02C1 Superscript voiced pharyngeal fricative - Drop it.
      case L'ˇ': // U+02C7 Caron or háček - Drop it.
      case L'ˊ': // U+02CA - Drop it.
      case L'ˤ': // U+02E4 Superscript voiced pharyngeal fricative - Drop it.
        break;

      case L'ɣ': // U+0263 Latin gamma used in some African languages.
        to = append_wchar (to, L'γ'); //  Replace with Greek gamma
        break;

      case L'ʊ': // U+028A Latin upsilon used in IPA and some African languages.
        to = append_wchar (to, L'ω'); //  Replace with Greek omega
        break;

        // Signs to be dropped:
      case L'ː':	// U+02D0 Vowel length marker (looks like colon which is dropped)
      case L'ˆ':	// U+02C6 Modifier letter circumflex accent
      case L'ƿ':	// U+01BF Old English letter wynn (transliteration?)
        break;

      default:
        if (! is_passthrough_script (wch))
        {
          // 	if ((wch >= 0x02B0 && wch <= 0x02FF) // Spacing Modifier Letters
          //       - needs individual treatment

          // Character not handled yet ...
          // Print message if the title contains more than this character:

          if (strlen (title) > wch_len)
            printf ("Unhandled character in '%s' (%lc), id = %d, code = %d (%X)\n",
                    title, (wint_t) wch, id, (int) wch, (unsigned int) wch);
        }
        // Characters from known but unhandled scripts, and everything
        // else not caught above, are taken as they are
        to = append_wchar (to, wch);
        break;
      }
    }
  }
  *to = '\0';
  return simple_title;
}