#!/usr/bin/perl -w
#
#  gen-qgb18030codec.pl
#      a Perl script that generates GB18030 <-> Unicode codec
#      for Qt library and can be extended for other applications.
#
#  Copyright (C) 2001 ThizLinux Laboratory Ltd., Thiz Technology Group Limited
#  Copyright (C) 2013 Thiz.com (Hong Kong) Ltd., Thiz Technology Group Limited
#
#  Written by: Anthony Fok <foka@debian.org> <anthony.fok@thizgroup.com>
#
#      with contributions from:
#              James Su <james.su@gmail.com> <suzhe@turbolinux.com.cn>
#              Yu Mingjian <yumj@sun.ihep.ac.cn> <yumingjian@china.com>
#              Chen Xiangyang <chenxy@sun.ihep.ac.cn>
#
#  License: This script itself is under the
#           GNU General Public License, v2 or later.
#
#           The C/C++ source code generated by this script
#           is under a BSD-style license, GPL, LGPL or QPL.
#           Permission is also granted for the code
#           to be included in Qt commercial versions.
#
#  Initial version:       2001-10-17
#  Last updated:          2001-11-01
#  Documentation updated: 2013-08-12
#
#  The latest version of this Perl script and supporting data is available
#  for download at:
#
#      http://people.debian.org/~foka/gb18030/
#
#  Todo:
#
#    * Migrate to Qt-3.x series
#    * Merge in GBK and GB2312 into this file to save space
#
#  Acknowledgement:
#
#    This script generates mapping data from gb-18030-2000.xml
#    available at the IBM Open Source Software International
#    Components for Unicode (ICU) web site:
#
#      http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml
#
#    Special thanks to charset gurus Markus Scherer (IBM), Dirk Meyer
#    (Adobe Systems) and Ken Lunde (Adobe Systems) for publishing
#    excellent GB 18030-2000 summary and specifications on the
#    Internet.  Some must-read documents are:
#
#      ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf
#      http://icu-project.org/docs/papers/gb18030.html
#      http://source.icu-project.org/repos/icu/data/trunk/charset/source/gb18030/gb18030.html
#
#    Also thanks to Chinese i18n/L10n gurus Suzhe, Yu Mingjian and
#    Chen Xiangyang whose contributions are included in this script.
#    Also to Bruno Haible for offering various GNU libiconv tables
#    which may be included in future revisions of this code,
#    and to Lars Knoll of Trolltech for including our work in Qt
#    and for his words of encouragement.
#
#    Last but not least, sincere thanks to ThizLinux Laboratory Ltd.,
#    now Thiz.com (Hong Kong) Ltd., Thiz Technology Group Limited,
#    and fellow colleagues who have offered me generous support for
#    this work.
#

use integer;

my (%g2u, %u2g, $uni, $gb, $i, $count);
my @ranges;

my $debug = 0;

#
# First, read the GB18030 <-> Unicode mapping data.
#
# Two kinds of lines are recognised in gb-18030-2000.xml:
#
#   <a u="..." b="..."/>     one code point <-> byte-sequence pair; stored
#                            in both %u2g (Unicode -> GB) and %g2u
#                            (GB -> Unicode), keyed by the hex strings
#                            exactly as they appear in the file.
#   <range uFirst=... >      one linearly mapped range; appended to @ranges
#                            as [ uFirst, uLast, bFirst, bLast ].
#
# Note: This is not a real XML parser, so it may fail when line-wrapping
#       or spacing changes in gb-18030-2000.xml.

my $xml_file = "gb-18030-2000.xml";

# Three-argument open with a lexical handle; report the file name and the
# OS error instead of a bare "Died".
open(my $xml_fh, '<', $xml_file)
    or die "Cannot open $xml_file: $!";
while (my $line = <$xml_fh>) {
    if ($line =~ m|^  <a u="([^"]+)" b="([^"]+)"/>|) {
        ($uni, $gb) = ($1, $2);
        $g2u{$gb} = $uni;
        $u2g{$uni} = $gb;
    } elsif ($line =~ m|^  <range uFirst="([^"]+)" uLast="([^"]+)"  bFirst="([^"]+)" bLast="([^"]+)"|) {
        push @ranges, [ $1, $2, $3, $4 ];
    }
}
close $xml_fh;

#
# 4-byte GB18030 <-> linear functions
#

# Convert a linear 4-byte GB18030 index into its "XX XX XX XX" form
# (four upper-case hex bytes separated by single spaces).
#
# The index is treated as a mixed-radix number: the 4th and 2nd bytes
# have 10 values each (starting at 0x30), the 3rd byte has 126 values
# and the 1st byte takes whatever is left (both starting at 0x81).
sub lin_to_g4($) {
    my ($index) = @_;

    # Peel the digits off from least to most significant.
    my $byte4 = 0x30 + $index % 10;
    my $byte3 = 0x81 + ($index / 10) % 126;
    my $byte2 = 0x30 + ($index / 1260) % 10;
    my $byte1 = 0x81 + $index / 12600;

    return join(" ", map { sprintf("%02X", $_) }
                     ($byte1, $byte2, $byte3, $byte4));
}

# Convert a 4-byte GB18030 code written as "XX XX XX XX" (hex bytes
# separated by whitespace) into its linear index; the inverse of
# lin_to_g4().
#
# The result is kept in lexical variables only, so calling this sub no
# longer overwrites a package-global $lin as a side effect.
sub g4_to_lin($) {
    my $g4 = shift;
    my ($byte1, $byte2, $byte3, $byte4) = map { hex($_) } split " ", $g4;

    # Weights: 12600 = 10 * 126 * 10, 1260 = 126 * 10, 10.
    return ($byte1 - 0x81) * 12600
         + ($byte2 - 0x30) * 1260
         + ($byte3 - 0x81) * 10
         + ($byte4 - 0x30);
}

#
# "Compact" GB18030 conversion functions (for Lookup table)
#
#   0000 0000 0000 0000
#
#  bit  1-4 : 4th byte of GB18030
#  bit  5-11: 3rd byte of GB18030
#  bit 12-15: 2nd and 1st bytes of GB18030
#  bit 16:    0 (to make sure the "compact" code is less than 0x8000)
#

# Pack a 4-byte GB18030 code ("XX XX XX XX") into the 15-bit "compact"
# lookup-table format:
#
#   offset chosen by the 1st byte
#     + ((2nd byte - 0x30) << 11)
#     + ((3rd byte - 0x81) << 4)
#     +  (4th byte - 0x30)
#
# Returns 0xffff when the 1st/2nd byte combination is not one of those
# the compact format can represent.  This now includes any 1st byte
# outside 0x81..0x84, which previously fell through and used an
# undefined (or stale) offset.
sub g4_to_g4table($) {
    my $g4 = shift;
    my ($lead, $second, $third, $fourth) = map { hex($_) } split " ", $g4;
    my $offset;

    if ($lead == 0x81) {
        # 2nd byte: 30, 36, 37, 38, 39 -- 36..39 are folded onto 31..34.
        $offset = 0;
        if ($second >= 0x36 and $second <= 0x39) {
            $second -= 5;
        } elsif ($second != 0x30) {
            return 0xffff;
        }
    } elsif ($lead == 0x82) {
        # 2nd byte: 30, 31, 32, 33, 34, 35
        $offset = 0x3000;
        return 0xffff if $second >= 0x36;
    } elsif ($lead == 0x83) {
        # 2nd byte: 36 only, stored as slot 0.
        $offset = 0x6000;
        return 0xffff if $second != 0x36;
        $second -= 6;
    } elsif ($lead == 0x84) {
        # 2nd byte: 30, 31
        $offset = 0x7000;
        return 0xffff if $second >= 0x32;
    } else {
        # Not representable in the compact format.
        return 0xffff;
    }

    return $offset + (($second - 0x30) << 11) + (($third - 0x81) << 4)
        + ($fourth - 0x30);
}

# Convert a linear 4-byte GB18030 index straight to the compact table
# format by chaining lin_to_g4() and g4_to_g4table().  The result is
# returned directly instead of being parked in a package-global
# $g4table first.
sub lin_to_g4table($) {
    my $lin = shift;
    return g4_to_g4table(lin_to_g4($lin));
}

# Expand a compact table entry back into a 4-byte GB18030 code string
# "XX XX XX XX".  The marker value 0xffff yields the empty string.
sub g4table_to_g4($) {
    my ($entry) = @_;

    return "" if $entry == 0xffff;

    # The entry's magnitude selects the 1st byte; the bits above bit 11,
    # relative to that band's base, give the 2nd byte.
    my ($lead, $second);
    if ($entry < 0x3000) {
        $lead   = 0x81;
        $second = 0x30 + ($entry >> 11);
        # Slots 1..4 stand for 2nd bytes 0x36..0x39.
        $second += 5 if $second >= 0x31;
    } elsif ($entry < 0x6000) {
        $lead   = 0x82;
        $second = 0x30 + (($entry - 0x3000) >> 11);
    } elsif ($entry < 0x7000) {
        $lead   = 0x83;
        $second = 0x36 + (($entry - 0x6000) >> 11);
    } else {
        $lead   = 0x84;
        $second = 0x30 + (($entry - 0x7000) >> 11);
    }

    my $third  = 0x81 + (($entry >> 4) & 0x7F);
    my $fourth = 0x30 + ($entry & 0xF);

    return sprintf("%02X %02X %02X %02X", $lead, $second, $third, $fourth);
}


#
# Ranges
#

# @ranges_tbl describes the areas that lie *between* the algorithmic
# ranges read into @ranges.  Each entry is
#   [ first code point, last code point, first GB code, last GB code ].
# The first area starts at U+0080 / GB+81308130.
@ranges_tbl = ([ "0080", undef, "81 30 81 30" ]);

for my $n (0 .. $#ranges - 1) {
    my ($u_first, $u_last, $b_first, $b_last) = @{ $ranges[$n] };

    # The area before range $n ends one step before the range starts ...
    $ranges_tbl[$n][1] = sprintf("%04X", hex($u_first) - 1);
    $ranges_tbl[$n][3] = lin_to_g4(g4_to_lin($b_first) - 1);

    # ... and the next area starts one step after the range ends; after
    # U+D7FF the next start is pinned to U+E000.
    my $next_u = ($u_last eq "D7FF")
        ? "E000"
        : sprintf("%04X", hex($u_last) + 1);
    $ranges_tbl[$n + 1][0] = $next_u;
    $ranges_tbl[$n + 1][2] = lin_to_g4(g4_to_lin($b_last) + 1);
}
# Don't want the last one (0x10000 -> 0x10FFFF)
pop @ranges_tbl;

# Index the areas by their starting point, once per direction.
for my $area (@ranges_tbl) {
    my ($u_begin, $u_end, $g_begin, $g_end) = @$area;
    $u2gtblranges{$u_begin} = $u_end;
    $g2utblranges{$g_begin} = $g_end;
}

# Split the area starting at U+E000: U+E000..U+E765 becomes its own
# entry and the remainder is re-keyed to start at U+E766.
die unless defined $u2gtblranges{"E000"};
@u2gtblranges{ "E766", "E000" } = ($u2gtblranges{"E000"}, "E765");

#
#  Begin output of the actual C/C++ code for the GB18030 codec table
#

print <<'EOT';
/****************************************************************************
** Implementation of QGb18030Codec template/macro class
**
** Copyright (C) 1992-2001 Trolltech AS.  All rights reserved.
**
** This file is part of the tools module of the Qt GUI Toolkit.
**
** This file may be distributed under the terms of the Q Public License
** as defined by Trolltech AS of Norway and appearing in the file
** LICENSE.QPL included in the packaging of this file.
**
** This file may be distributed and/or modified under the terms of the
** GNU General Public License version 2 as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL included in the
** packaging of this file.
**
** Licensees holding valid Qt Enterprise Edition or Qt Professional Edition
** licenses may use this file in accordance with the Qt Commercial License
** Agreement provided with the Software.
**
** This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
** WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
**
** See http://www.trolltech.com/pricing.html or email sales@trolltech.com for
**   information about Qt Commercial License Agreements.
** See http://www.trolltech.com/qpl/ for QPL licensing information.
** See http://www.trolltech.com/gpl/ for GPL licensing information.
**
** Contact info@trolltech.com if any conditions of this licensing are
** not clear to you.
**
**********************************************************************/

/*! \class QGb18030Codec qgb18030codec.h

  \brief This class provides conversion to and from the Chinese
  GB18030/GBK/GB2312 encoding.

      Last updated: November 1, 2001

  GBK, formally the Chinese Internal Code Specification, is a commonly
  used extension of GB 2312-80.  Microsoft Windows uses it under the
  name codepage 936.

  GBK has been superseded by the new Chinese national standard
  GB 18030-2000, which added a 4-byte encoding while remaining
  compatible with GB2312 and GBK.  The new GB18030-2000 may be described
  as a special encoding of Unicode 3.x and ISO-10646-1.

  Special thanks to charset gurus Markus Scherer (IBM),
  Dirk Meyer (Adobe Systems) and Ken Lunde (Adobe Systems) for publishing
  excellent GB 18030-2000 summary and specifications on the Internet.
  Some must-read documents are: <ul>
    <li><a href="ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/pdf/GB18030_Summary.pdf">
    <li><a href="http://source.icu-project.org/repos/icu/data/trunk/charset/source/gb18030/gb18030.html">
    <li><a href="http://source.icu-project.org/repos/icu/data/trunk/charset/data/xml/gb-18030-2000.xml">
  </ul>

  The GBK codec was contributed to Qt by
  Justin Yu \<justiny@turbolinux.com.cn\> and
  Sean Chen \<seanc@turbolinux.com.cn\>.  They may also be reached at
  Yu Mingjian \<yumj@sun.ihep.ac.cn\>, \<yumingjian@china.com\>
  Chen Xiangyang \<chenxy@sun.ihep.ac.cn\>

  The GB18030 codec Qt functions were contributed to Qt by
  James Su \<james.su@gmail.com\>, \<suzhe@turbolinux.com.cn\>
  who pioneered much of GB18030 development on GNU/Linux systems.

  The GB18030 codec was contributed to Qt by
  Anthony Fok \<foka@debian.org\>, \<anthony.fok@thizgroup.com\>
  using a Perl script to generate C++ tables from gb-18030-2000.xml
  while merging contributions from James Su, Justin Yu and Sean Chen.
  A copy of the source Perl script is available at:

      <a href="http://people.debian.org/~foka/gb18030/gen-qgb18030codec.pl">

  The copyright notice for their code follows:

  \mustquote

  Copyright (C) 2000 TurboLinux, Inc.  Written by Justin Yu and Sean Chen.
  Copyright (C) 2001 Turbolinux, Inc.  Written by James Su.
  Copyright (C) 2001 ThizLinux Laboratory Ltd.  Written by Anthony Fok.

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:<ol>
  <li> Redistributions of source code must retain the above copyright
       notice, this list of conditions and the following disclaimer.
  <li> Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
  </ol>

  THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  SUCH DAMAGE.
*/


#include "qgb18030codec.h"

#ifndef QT_NO_CODECS

#define InRange(c, lower, upper)  (((c) >= (lower)) && ((c) <= (upper)))
#define IsLatin(c)	((c) <= 0x7F)
#define Is1stByte(c)	(InRange((c), 0x81, 0xFE))
#define Is2ndByteIn2Bytes(c)	(InRange((c), 0x40, 0xFE) && (c) != 0x7F)
#define Is2ndByteIn4Bytes(c)	(InRange((c), 0x30, 0x39))
#define Is2ndByte(c)	(Is2ndByteIn2Bytes(c) || Is2ndByteIn4Bytes(c))
#define Is3rdByte(c)	(InRange((c), 0x81, 0xFE))
#define Is4thByte(c)	(InRange((c), 0x30, 0x39))

#define QValidChar(u)	((u) ? QChar((ushort)(u)) : QChar::replacement)

/* User-defined areas:	UDA 1: 0xAAA1 - 0xAFFE (564/0)
			UDA 2: 0xF8A1 - 0xFEFE (658/0)
			UDA 3: 0xA140 - 0xA7A0 (672/0) */
#define IsUDA1(a, b)	(InRange((a), 0xAA, 0xAF) && InRange((b), 0xA1, 0xFE))
#define IsUDA2(a, b)	(InRange((a), 0xF8, 0xFE) && InRange((b), 0xA1, 0xFE))
#define IsUDA3(a, b)	(InRange((a), 0xA1, 0xA7) && InRange((b), 0x40, 0xA0) && ((b) != 0x7F))

// One entry of the per-256-code index tables.  Codes whose low byte lies
// in tblBegin..tblEnd are looked up in a table (at index code - tblOffset);
// all others are computed as algOffset + low byte.
struct indexTbl_t {
    Q_UINT8     tblBegin;
    Q_UINT8     tblEnd;
    Q_UINT16    tblOffset;
    Q_UINT16    algOffset;
};

static uint qt_Gb18030ToUnicode(const uchar *gbstr, int& len);
static int qt_UnicodeToGb18030(uint unicode, uchar *gbchar);

/*! \internal */
QGb18030Codec::QGb18030Codec()
{
    // Intentionally empty.
}

/*! \reimp */
int QGb18030Codec::mibEnum() const
{
    // No MIBenum was registered for GB18030 when this was written
    // (GB2312 uses 2025), so Suzhe temporarily selected 2500.
    // See http://www.iana.org/assignments/character-sets
    // NOTE(review): check whether IANA has assigned one since.
    const int provisionalMib = 2500;
    return provisionalMib;
}

/*! \reimp */
QCString QGb18030Codec::fromUnicode(const QString& uc, int& len_in_out) const
{
    // Number of characters to convert: the whole string when len_in_out
    // is negative, otherwise at most len_in_out characters.
    const int available = (int)uc.length();
    int count = available;
    if (len_in_out >= 0 && len_in_out < available)
        count = len_in_out;

    // Worst case is four bytes per character, plus one spare byte.
    QCString rstr(count * 4 + 1);
    uchar *const start = (uchar*)rstr.data();
    uchar *out = start;

    for (int pos = 0; pos < count; ++pos) {
        QChar ch = uc[pos];

        if (ch.row() == 0x00 && ch.cell() < 0x80) {
            // ASCII
            *out++ = ch.cell();
            continue;
        }

        uchar encoded[4];
        const int n = qt_UnicodeToGb18030(ch.unicode(), encoded);
        if (n == 0) {
            // Error
            *out++ = '?';   // unknown char
            continue;
        }
        for (int k = 0; k < n; ++k)
            *out++ = encoded[k];
    }

    // Report the number of bytes produced and cut the buffer to size.
    len_in_out = out - start;
    rstr.truncate(len_in_out);
    return rstr;
}

/*! \reimp */
QString QGb18030Codec::toUnicode(const char* chars, int len) const
{
    QString result;
    int pos = 0;

    while (pos < len) {
        const uchar lead = chars[pos];

        if (IsLatin(lead)) {
            // ASCII
            result += QChar(lead);
            ++pos;
            continue;
        }

        if (Is1stByte(lead)) {
            // GB18030 ?  Offer the converter every remaining byte; it
            // writes back how many it actually used.
            int used = len - pos;
            const uint u = qt_Gb18030ToUnicode((const uchar*)(chars + pos), used);
            if (used == 2 || used == 4) {
                result += QValidChar(u);
                pos += used;
                continue;
            }
        }

        // Invalid, undefined or incomplete: emit one replacement
        // character and resynchronise on the next byte.
        result += QChar::replacement;
        ++pos;
    }
    return result;
}

/*! \reimp */
const char* QGb18030Codec::name() const
{
    static const char codecName[] = "GB18030";
    return codecName;
}

/*! \reimp */
int QGb18030Codec::heuristicNameMatch(const char* hint) const
{
    // A "zh_CN" locale prefix is worth 10 points by itself; the charset
    // name is then whatever follows the first '.'.
    const bool zh = (qstrnicmp(hint, "zh_CN", 5) == 0);
    const int score = zh ? 10 : 0;

    const char *p = hint;
    if (zh) {
        const char *dot = strchr(hint, '.');
        if (dot == 0)
            return score;
        p = dot + 1;
    }

    if (p) {
        if (qstricmp(p, "GB18030") == 0)
            return score + 14;
// Commented out until GBK and GB2312 support is added in this file.
#if 0
        if (qstricmp(p, "GBK") == 0)
            return score + 12;
        else if (qstricmp(p, "GB2312") == 0)
            return score + 12;
#endif
    }
    return QTextCodec::heuristicNameMatch(hint);
}

/*! \reimp */
int QGb18030Codec::heuristicContentMatch(const char* chars, int len) const
{
    int score = 0;
    for (int i=0; i<len; i++) {
	uchar ch = chars[i];
	// No nulls allowed.
	if ( !ch )
	    return -1;

	if (ch < 32 && ch != '\t' && ch != '\n' && ch != '\r') {
	    // Suspicious
	    if ( score )
		score--;
	} else if ( ch < 0x80 ) {
	    // Inconclusive
	    score++;
	} else if ( Is1stByte(ch) ) {
	    if ( i < len-1 ) {
		uchar ch2 = chars[++i];
		if ( Is2ndByteIn4Bytes(ch2) && i < len-2 ) {
		    uchar ch3 = chars[++i];
		    if ( Is3rdByte(ch3) && i < len-1 ) {
			uchar ch4 = chars[++i];
			if ( !Is4thByte(ch4) )
			    return -1;
			score += 2;
		    } else {
			return -1;
		    }
		} else if ( !Is2ndByteIn2Bytes(ch2) ) {
		    return -1;
		} else {
		    score += 2;
		}
	    }
	    score++;
	} else {
	    // Invalid
	    return -1;
	}
    }
    return score;
}

// Incremental GB18030 decoder.  The bytes of a partially received
// character are kept in buf/nbuf between calls to toUnicode(), so a
// multi-byte character may be split across successive chunks.
class QGb18030Decoder : public QTextDecoder {
    uchar buf[4];   // bytes collected so far for the current character
    int nbuf;       // how many bytes of buf are in use (0..3)
public:
    QGb18030Decoder() : nbuf(0)
    {
    }
    QString toUnicode(const char* chars, int len)
    {
        QString result;
        for (int i=0; i<len; i++) {
            uchar ch = chars[i];
            switch (nbuf) {
              case 0:
                if ( ch < 0x80 ) {
                    // ASCII
                    result += QChar(ch);
                } else if ( Is1stByte(ch) ) {
                    // GB18030?
                    buf[0] = ch;
                    nbuf = 1;
                } else {
                    // Invalid
                    result += QChar::replacement;
                }
                break;
              case 1:
                // Second byte: it decides between the 2-byte and the
                // 4-byte form.  Only the 2-byte second-byte range may be
                // tested here; testing Is2ndByte() would also swallow
                // 0x30..0x39 and the 4-byte branch below could never run.
                if ( Is2ndByteIn2Bytes(ch) ) {
                    buf[1] = ch;
                    int clen = 2;
                    uint u = qt_Gb18030ToUnicode(buf, clen);
                    if (clen == 2) {
                        result += QValidChar(u);
                    } else {
                        result += QChar::replacement;
                    }
                    nbuf = 0;
                } else if ( Is2ndByteIn4Bytes(ch) ) {
                    buf[1] = ch;
                    nbuf = 2;
                } else {
                    // Error
                    result += QChar::replacement;
                    nbuf = 0;
                }
                break;
              case 2:
                // GB18030 3 bytes
                if ( Is3rdByte(ch) ) {
                    buf[2] = ch;
                    nbuf = 3;
                } else {
                    result += QChar::replacement;
                    nbuf = 0;
                }
                break;
              case 3:
                // GB18030 4 bytes
                if ( Is4thByte(ch) ) {
                    buf[3] = ch;
                    int clen = 4;
                    uint u = qt_Gb18030ToUnicode(buf, clen);
                    if (clen == 4) {
                        result += QValidChar(u);
                    } else {
                        result += QChar::replacement;
                    }
                } else {
                    result += QChar::replacement;
                }
                nbuf = 0;
                break;
            }
        }
        return result;
    }
};

/*! \reimp */
QTextDecoder* QGb18030Codec::makeDecoder() const
{
    // Each call hands out a fresh decoder object.
    QTextDecoder *decoder = new QGb18030Decoder();
    return decoder;
}

EOT


#
#  GB18030-to-Unicode intermediate index table (2-byte and 4-byte GB18030)
#
# One indexTbl_t entry is printed for every 256-code page of the linear
# 4-byte GB18030 space, pages 0x00 .. 0x99 (154 entries), two per line.
print "static const indexTbl_t gb18030_to_ucs_index[154] = {\n";
for my $page (0x00 .. 0x99) {
    # Walk the algorithmic ranges in file order.  Every range visited
    # that starts on an earlier page adds its size to tblOffset; the
    # first range overlapping this page supplies algOffset and ends the
    # walk.  algOffset stays 0 when no range overlaps the page.
    my ($tblOffset, $algOffset) = (0x0, 0);
    for my $range (@ranges) {
        my $lin_first  = g4_to_lin($range->[2]);
        my $lin_last   = g4_to_lin($range->[3]);
        my $page_first = $lin_first >> 8;

        $tblOffset += $lin_last - $lin_first + 1 if $page > $page_first;
        next if $page < $page_first or $page > ($lin_last >> 8);

        $algOffset = hex($range->[0])
            - ($lin_first & 0xFF) + 256 * ($page - $page_first);
        last;
    }

    # Low-byte window served by the lookup table on this page, taken
    # from the first table area overlapping the page.  (0xFF, 0x00) is
    # an empty window.
    my ($tblBegin, $tblEnd) = (0xFF, 0x00);
    for my $area (@ranges_tbl) {
        my $lin_first = g4_to_lin($area->[2]);
        my $lin_last  = g4_to_lin($area->[3]);
        my ($page_first, $page_last) = ($lin_first >> 8, $lin_last >> 8);

        next unless ($page >= $page_first and $page <= $page_last);

        $tblBegin = ($page == $page_first) ? $lin_first & 0xFF : 0x00;
        $tblEnd   = ($page == $page_last)  ? $lin_last  & 0xFF : 0xFF;
        last;
    }

    printf("  /* U+%02X__ */", $page) if $page % 2 == 0;
    printf("\t{0x%02X, 0x%02X, 0x%04X, 0x%04X},",
           $tblBegin, $tblEnd, $tblOffset, $algOffset);
    print "\n" if $page % 2 == 1;
}
print "};\n\n";


#
#  Unicode-to-GB18030 Index table (2-byte and 4-byte GB18030)
#
# One indexTbl_t entry is printed for every 256-code-point row of the
# BMP (0x00 .. 0xFF), two per line.
print "static const indexTbl_t ucs_to_gb18030_index[256] = {\n";
for my $row (0x00 .. 0xFF) {
    # tblOffset starts at 0x80 and grows by the size of every
    # algorithmic range visited that starts on an earlier row; the first
    # range overlapping this row supplies algOffset and ends the walk.
    my ($tblOffset, $algOffset) = (0x80, 0);
    for my $range (@ranges) {
        my $u_first   = hex($range->[0]);
        my $u_last    = hex($range->[1]);
        my $row_first = $u_first >> 8;

        $tblOffset += $u_last - $u_first + 1 if $row > $row_first;
        next if $row < $row_first or $row > ($u_last >> 8);

        $algOffset = g4_to_lin($range->[2])
            - ($u_first & 0xFF) + 256 * ($row - $row_first);
        last;
    }

    # Low-byte window served by the lookup table on this row, taken from
    # the first table area (in sorted key order) overlapping the row.
    my ($tblBegin, $tblEnd) = (0xFF, 0x00);
    for my $start (sort keys %u2gtblranges) {
        my $u_first = hex($start);
        my $u_last  = hex($u2gtblranges{$start});
        my ($row_first, $row_last) = ($u_first >> 8, $u_last >> 8);

        next unless ($row >= $row_first and $row <= $row_last);

        $tblBegin = ($row == $row_first) ? $u_first & 0xFF : 0x00;
        $tblEnd   = ($row == $row_last)  ? $u_last  & 0xFF : 0xFF;
        last;
    }

    # Rows from 0xE0 up get an extra 0xF66 added to tblOffset.  Rows
    # 0xE0..0xE6 have no table window at all, and on row 0xE7 the window
    # starts at low byte 0x66 (U+E766).
    if ($row >= 0xE0) {
        $tblOffset += 0xF66;
        if ($row <= 0xE6) {
            ($tblBegin, $tblEnd) = (0xFF, 0x00);
        } elsif ($row == 0xE7) {
            ($tblBegin, $tblEnd) = (0x66, 0xFF);
        }
    }

    printf("  /* 0x%02X__ */", $row) if $row % 2 == 0;
    printf("\t{0x%02X, 0x%02X, 0x%04X, 0x%04X},",
           $tblBegin, $tblEnd, $tblOffset, $algOffset);
    print "\n" if $row % 2 == 1;
}
print "};\n\n";


# 
#  Unicode-to-GB18030 Lookup table (2-byte and 4-byte GB18030)
#
# Print every individually mapped code point in sorted key order, eight
# per line, with a header comment at the start of each contiguous area.
print "static Q_UINT16 const ucs_to_gb18030[28839] = {";
my $last = "0000";
for my $code (sort keys %u2g) {
    my $cp = hex($code);

    # ASCII and U+E000..U+E765 are not stored in the table.
    next if $cp <= 0x7F;
    next if ($cp >= 0xE000 and $cp <= 0xE765);

    # 4-byte codes ("XX XX XX XX", 11 characters) are stored in compact
    # form; 2-byte codes are stored as-is with the space removed.
    my $bytes = $u2g{$code};
    my $cell;
    if (length($bytes) == 11) {
        $cell = sprintf("%04X", g4_to_g4table($bytes));
    } else {
        ($cell = $bytes) =~ s/ //g;
    }

    my $column = $cp % 8;
    if ($cp == hex($last) + 1) {
        # Continuing the current area: label the start of each line.
        print "  /* U+$code */\t" if $column == 0;
    } else {
        # A new contiguous area begins here.
        print "\n" unless hex($last) % 8 == 7;
        if (not defined($u2gtblranges{$code})) {
            die "This is not here: $code\n";
        }
        print "  /* Contiguous area: U+", $code,
            " .. U+", $u2gtblranges{$code}, " */\n";
        print "  /* U+$code */\t";
        print "\t" x $column;
    }
    print "0x", $cell;
    $last = $code;

    my $separator = ($column == 7) ? ",\n" : ",\t";
    print $separator;
}
print "\n};\n\n";


# 
#  GB18030-to-Unicode Lookup table (2-byte GB18030)
#
# Print the 2-byte lookup table row by row (1st byte 0x81..0xFE, 2nd byte
# 0x40..0x7E and 0x80..0xFE).  The three user-defined areas are left out
# of the table; only a "Skip" comment is printed for them.
# 23940 - (564 + 658 + 672) = 22046
print "static Q_UINT16 const gb18030_2byte_to_ucs[22046] = {\n";
for my $first (0x81 .. 0xfe) {
    for my $second (0x40 .. 0x7e, 0x80 .. 0xfe) {
        my $gb2 = sprintf("%02X %02X", $first, $second);

        # Private User Area / UDA
        if ($first >= 0xaa and $first <= 0xaf and $second >= 0xa1) {
            if ($second == 0xa1) {
                print "\t\t";
                printf "/* Skip: GB 0x%02XA1..0x%02XFE (UDA 1) */\n", $first, $first;
            }
        } elsif ($first >= 0xf8 and $second >= 0xa1) {
            if ($second == 0xa1) {
                print "\t\t";
                printf "/* Skip: GB 0x%02XA1..0x%02XFE (UDA 2) */\n", $first, $first;
            }
        } elsif ($first >= 0xa1 and $first <= 0xa7 and $second <= 0xa0) {
            if ($second == 0x40) {
                printf "\t/* Skip: GB 0x%02X40..0x%02X7E, 0x%02X80..0x%02XA0 (UDA 3) */\n",
                        $first, $first, $first, $first;
            }
            if ($second == 0xa0) {
                # The entries printed next are 0xA1..0xFE of this same
                # row, so the label must name $first, not $first + 1.
                printf "\t/* GB 0x%02XA1..0x%02XFE */\n\t\t",
                        $first, $first;
            }
        } else {

            if ($second == 0x40) {
                printf "\t/* GB 0x%02X40..0x%02X7E */\n\t", $first, $first;
            } elsif ($second == 0x80) {
                # Rows whose 0xA1..0xFE part is a UDA stop at 0xA0.
                if ( ($first >= 0xaa and $first <= 0xaf) or
                         ($first >= 0xf8 and $first <= 0xfe) ) {
                    printf "\t/* GB 0x%02X80..0x%02XA0 */\n\t", $first, $first;
                } else {
                    printf "\t/* GB 0x%02X80..0x%02XFE */\n\t", $first, $first;
                }
            }

            my $ucs = $g2u{$gb2};
            print "0x", $ucs;
            if ($second % 8 == 7 or $second == 0x7e or $second == 0xfe) {
                print ",\n";
                print "\t" unless ($second & 0x7e) == 0x7e;
            } else {
                print ",\t";
            }
        }
    }

}
print "};\n\n";

# 
#  GB18030-to-Unicode Lookup table (4-byte GB18030)
#
# Print every individually mapped 4-byte code in sorted key order, five
# per line, with a header comment at the start of each contiguous area.
print "static Q_UINT16 const gb18030_4byte_to_ucs[6793] = {";
my $prev_lin = 0;
for my $g4 (sort keys %g2u) {
    # Keys of the form "XX XX XX XX" are 11 characters long; skip the rest.
    next if length($g4) != 11;

    my $cur_lin = g4_to_lin($g4);
    my $column  = $cur_lin % 5;

    if ($cur_lin == $prev_lin + 1) {
        # Continuing the current area: label the start of each line.
        print "  /* GB+$g4 */\t" if $column == 0;
    } else {
        # A new contiguous area begins here.
        print "\n" unless $prev_lin % 5 == 4;
        if (not defined($g2utblranges{$g4})) {
            die "This is not here: $g4\n";
        }
        print "  /* Contiguous area: GB+", $g4,
            " .. GB+", $g2utblranges{$g4}, " */\n";
        print "  /* GB+$g4 */\t";
        print "\t" x $column;
    }
    print "0x", $g2u{$g4};
    $prev_lin = $cur_lin;

    my $separator = ($column == 4) ? ",\n" : ",\t";
    print $separator;
}
print "\n};\n\n";


print <<'EOT';
/*! \internal */
uint gb4lin_to_gb(uint gb4lin) {
    // Split a linear 4-byte GB18030 index into its four bytes and pack
    // them, first byte in the most significant position.
    uchar   a, b, c, d;
    a = 0x81 + gb4lin / 12600;
    b = 0x30 + (gb4lin / 1260) % 10;
    c = 0x81 + (gb4lin / 10) % 126;
    d = 0x30 + gb4lin % 10;
    // Widen to uint before shifting: a is at least 0x81, so shifting the
    // promoted int left by 24 would overflow a signed int.
    return ((uint)a << 24) | ((uint)b << 16) | ((uint)c << 8) | (uint)d;
}

static uint qt_Gb18030ToUnicode(const uchar *gbstr, int& len) {
    /* Returns Unicode.
       On entry len is the number of bytes available at gbstr; on return
       it is the number of bytes used (1, 2 or 4).  Anything that cannot
       be decoded yields QChar::replacement with len set to 1. */
    uint    uni;
    uchar   first = *gbstr;

    if ( IsLatin(first) ) {
        len = 1;
        uni = (uint)first;
    }
    else if ( Is1stByte(first) && len >= 2 ) {
        uchar   second = gbstr[1];

        if ( Is2ndByteIn2Bytes(second) ) {
            len = 2;

            // The three user-defined areas map linearly onto the
            // private use area; everything else goes through the table.
            if (IsUDA1(first, second))
                uni = 0xE000 + (first - 0xAA) * 94 + (second - 0xA1);
            else if (IsUDA2(first, second))
                uni = 0xE234 + (first - 0xF8) * 94 + (second - 0xA1);
            else if (IsUDA3(first, second))
                uni = 0xE4C6 + (first - 0xA1) * 96 + (second - 0x40)
                             - ((second >= 0x80) ? 1 : 0);
            else {
                // Use the mapping table
                uint i;

                // 190 second-byte values per row (0x7F is a gap).
                i = (first - 0x81) * 190 + (second - 0x40)
                                         - ((second >= 0x80) ? 1 : 0);

                // Subtract the UDA entries that precede this code,
                // because they are not stored in the table.
                if (InRange(first, 0xA1, 0xA7))
                    i -= (first - 0xA0) * 96;
                if (first > 0xA7)
                    i -= 672;
                if (InRange(first, 0xAA, 0xAF))
                    i -= (first - 0xAA) * 94;
                if (first > 0xaf)
                    i -= 564;
                if (first >= 0xf8)
                    i -= (first - 0xF8) * 94;

                uni = (uint)gb18030_2byte_to_ucs[i];
            }
        }
        else if ( Is2ndByteIn4Bytes(second) && len >= 4 ) {
            uchar   third  = gbstr[2],
                    fourth = gbstr[3];

            if ( Is3rdByte(third) && Is4thByte(fourth) ) {
                // Valid 4-byte GB18030, whether defined or not
                uint        gb4lin;
                indexTbl_t  g2u;

                gb4lin = (first - 0x81) * 12600 + (second - 0x30) * 1260
                          + (third - 0x81) * 10 + (fourth - 0x30);

                len = 4;
                if (InRange(gb4lin, 0, 0x99FB)) {
                    /* GB+81308130 - GB+8431A439 */
                    g2u = gb18030_to_ucs_index[gb4lin >> 8];

                    if ((gb4lin & 0xFF) >= g2u.tblBegin &&
                        (gb4lin & 0xFF) <= g2u.tblEnd) {

                        uni = (uint)gb18030_4byte_to_ucs[gb4lin - g2u.tblOffset];
                    }
                    else {
                        uni = g2u.algOffset + (gb4lin & 0xFF);
                    }
                } else if (InRange(gb4lin, 0x2E248, 0x12E247)) {
                    /* GB+90308130 - GB+E3329A35 */
                    // Linear 0x2E248 is U+10000, so the difference is
                    // 0x1E248 -- the same constant qt_UnicodeToGb18030()
                    // adds in the other direction.
                    uni = gb4lin - 0x1E248;
                } else {
                    /* undefined or reserved area */
                    len = 1;
                    uni = QChar::replacement;
                }
            }
            else {
                len = 1;
                uni = QChar::replacement;
            }
        }
        else {
            len = 1;
            uni = QChar::replacement;
        }
    }
    else {
        len = 1;
        uni = QChar::replacement;
    }
    return uni;
}


static int qt_UnicodeToGb18030(uint uni, uchar *gbchar) {
    /* Returns the bytesize of the GB18030 character written to gbchar:
       1, 2 or 4, or 0 when uni has no GB18030 representation. */
    uint gb;

    if ( IsLatin(uni) ) {
        gbchar[0] = (uchar)uni;
        return 1;
    }

    if (uni <= 0xD7FF || InRange(uni, 0xE766, 0xFFFF)) {
        const indexTbl_t page = ucs_to_gb18030_index[uni >> 8];
        const uint low = uni & 0xFF;
        const bool tabulated = low >= page.tblBegin && low <= page.tblEnd;

        if (!tabulated) {
            // Use algorithm (4-byte GB18030)
            uint gb4lin = page.algOffset + low;
            // Yikes, my index table could not cover all the bases...
            if (InRange(uni, 0x49B8, 0x49FF))
                gb4lin -= 11;
            gb = gb4lin_to_gb(gb4lin);
        } else {
            // Use mapping table (2-byte or 4-byte GB18030)
            const uint tblEntry = ucs_to_gb18030[uni - page.tblOffset];

            if (tblEntry <= 0x8000) {
                // 4-byte GB18030 stored in a special compact format:
                // the entry's magnitude selects the first byte, and the
                // bits above bit 11 give the second byte relative to it.
                uchar lead;
                uchar second = 0x30 + (tblEntry >> 11);
                if (tblEntry < 0x3000) {
                    lead = 0x81;
                    if (second >= 0x31)
                        second += 5;
                } else if (tblEntry < 0x6000) {
                    lead = 0x82;
                    second -= 6;
                } else if (tblEntry < 0x7000) {
                    lead = 0x83;
                    second -= 6;
                } else {
                    lead = 0x84;
                    second -= 14;
                }
                gbchar[0] = lead;
                gbchar[1] = second;
                gbchar[2] = 0x81 + ((tblEntry >> 4) & 0x7F);
                gbchar[3] = 0x30 + (tblEntry & 0xF);
                return 4;
            }

            // 2-byte GB18030
            gb = tblEntry;
        }
    }
    else if (InRange(uni, 0xE000, 0xE765)) {
        // User-defined areas in GB18030 (2-byte)
        if (uni <= 0xE233) {
            const uint k = uni - 0xE000;
            gb = 0xAAA1 + ((k / 94) << 8) + k % 94;
        } else if (uni <= 0xE4C5) {
            const uint k = uni - 0xE234;
            gb = 0xF8A1 + ((k / 94) << 8) + k % 94;
        } else {
            const uint k = uni - 0xE4C6;
            gb = 0xA140 + ((k / 96) << 8) + k % 96;
            // Skip the gap at 0x7F
            if ((gb & 0xFF) >= 0x7F)
                gb++;
        }
    }
    else if (InRange(uni, 0x10000, 0x10FFFF)) {
        // Qt 2.3.x does not support 32-bit Unicode yet, but what the heck...
        // (U+10000 = GB+90308130) to (U+10FFFF = GB+E3329A35)
        gb = gb4lin_to_gb(0x1E248 + uni);
    }
    else {
        // Surrogate area and other undefined/reserved areas
        *gbchar = 0;
        return 0;
    }

    // Store the packed code, most significant byte first.
    if (gb > 0xFFFF) {
        gbchar[0] = (uchar)((gb >> 24) & 0xFF);
        gbchar[1] = (uchar)((gb >> 16) & 0xFF);
        gbchar[2] = (uchar)((gb >> 8) & 0xFF);
        gbchar[3] = (uchar)(gb & 0xFF);
        return 4;
    }
    gbchar[0] = (uchar)((gb >> 8) & 0xFF);
    gbchar[1] = (uchar)(gb & 0xFF);
    return 2;
}

#endif

EOT

exit;
