Compas Pascal: Delphi 2009 strings explained by example

This code snippet explains by example how the new string types work:

// Ansistring types bound to a fixed code page. The number in parentheses is
// the Windows code page identifier; assigning to a variable of one of these
// types converts the text to that code page.
type
  OemCp437=type ansistring(437);            // OEM code page 437 (original IBM PC / DOS)
  CyrillicString=type ansistring(1251);     // Windows-1251 (Cyrillic)
  DanishString=type ansistring(1252);       // Windows-1252 (Western European)
  GreekString=type ansistring(1253);        // Windows-1253 (Greek)
  usascii=type ansistring(20127);           // 7-bit US-ASCII
  Iso88591String=type ansistring(28591);    // ISO-8859-1 (Latin-1)
  Iso885915String=type ansistring(28605);   // ISO-8859-15 (Latin-9)
  utf7string=type ansistring(65000);        // UTF-7

  // These declarations compile, but the types do not work: ansistring cannot
  // hold UTF-16 or UTF-32 data (the example procedure shows that assigning
  // to utf32_string leaves it empty).
  utf16le_string=type ansistring(1200);     // UTF-16, little endian
  utf16be_string=type ansistring(1201);     // UTF-16, big endian
  utf32_string=type ansistring(12000);      // UTF-32, little endian
  utf32be_string=type ansistring(12001);    // UTF-32, big endian

// Button click handler that demonstrates the Delphi 2009 string types.
// It assigns the same text to strings of different types/code pages and uses
// Assert to document the result of each implicit conversion.
// Sender is not used. The statements are order-dependent: each Assert checks
// the state left behind by the assignments just above it.
procedure TForm3.Button1Click(Sender: TObject);
var
  utf16:string;                 // the default string type, holding UTF-16
  local:ansistring;             // local 8-bit character set
  raw:rawbytestring;            // takes over bytes and character set of the source ansistring
  utf8:utf8string;
  utf7:utf7string;
  cyrillic:CyrillicString;
  danish:DanishString;
  greek:GreekString;
  iso88591:Iso88591String;
  iso885915:Iso885915String;
  Cp437:OemCp437;
  ascii:usascii;
  utf32:utf32_string;
begin
  // Ansistring cannot be used for utf16 and utf32:
  // the assignment leaves the string empty instead of storing the text.
  utf32:='asdf';
  Assert (utf32='');

  // Demonstrating what UTF-16 is
  utf16:=#$1D160;            // This is a musical note (000011101000101100000), see http://unicode.org/charts/PDF/U1D100.pdf
  Assert (length(utf16)=2);  // This character occupies 2 positions in UTF-16
  Assert (utf16[1]=#$D834);  // 110110 0000110100 First half of the symbol
  Assert (utf16[2]=#$DD60);  // 110111 0101100000 Second half of the symbol
  // The same character takes 4 bytes in UTF-8
  utf8:=utf16;
  Assert (length(utf8)=4);
  Assert (utf8[1]=#$F0);   // 11110 000
  Assert (utf8[2]=#$9D);   // 10 011101
  Assert (utf8[3]=#$85);   // 10 000101
  Assert (utf8[4]=#$A0);   // 10 100000
  // The character does not exist in Windows-1252, so it is replaced by '?'
  danish:=utf16;
  Assert (danish='??');    // Note how Windows incorrectly converts to two letters!
  Assert (length(danish)=2);
  // Same result when the source is the 4-byte UTF-8 string
  danish:=utf8;
  Assert (danish='??');    // Note how Windows incorrectly converts to two letters!
  Assert (length(danish)=2);

  // Demonstrating the euro character
  utf16:='€';
  danish:=utf16;
  cyrillic:=utf16;
  greek:=utf16;
  iso88591:=utf16;
  iso885915:=utf16;
  Cp437:=utf16;
  ascii:=utf16;
  utf8:=utf16;
  utf7:=utf16;
  // One position in UTF-16 and in every 8-bit character set
  // (either the real euro sign or a '?' replacement, see below)
  Assert (length(utf16)=1);
  Assert (length(danish)=1);
  Assert (length(cyrillic)=1);
  Assert (length(greek)=1);
  Assert (length(iso88591)=1);
  Assert (length(iso885915)=1);
  Assert (length(Cp437)=1);
  Assert (length(ascii)=1);
  Assert (length(utf7)=5);
  Assert (length(utf8)=3);
  // The euro sign has a different code in each character set that contains it
  Assert (ord(utf16[1])=8364);     // U+20AC
  Assert (ord(danish[1])=128);     // $80
  Assert (ord(cyrillic[1])=136);   // $88
  Assert (ord(greek[1])=128);      // $80
  Assert (ord(iso885915[1])=164);  // $A4
  // Character sets without a euro sign get '?' instead
  Assert (iso88591='?');
  Assert (ascii='?');
  Assert (Cp437='?');
  // Comparisons take the character set into account: strings that still
  // contain the euro sign compare equal although their bytes differ
  Assert (greek=utf16);
  Assert (danish=utf16);
  Assert (cyrillic=utf16);
  Assert (utf7=utf16);
  Assert (utf7=utf8);
  Assert (iso885915=utf16);
  // Strings that lost the euro sign no longer compare equal
  Assert (iso88591<>utf16);
  Assert (Cp437<>utf16);
  Assert (ascii<>utf16);
  Assert (cyrillic=danish);  // byte $88 vs. byte $80, but the same character

  // Convert from Unicode to special character sets
  utf16:='abc ÆØÅ рыба'; // utf16 holds the text as UTF-16
  local:=utf16;  // Converts to local 8-bit character set
  raw:=utf16;    // Converts to local 8-bit character set
  utf8:=utf16;   // Converts to utf-8
  cyrillic:=utf16;
  danish:=utf16;
  greek:=utf16;
  Cp437:=utf16;
  ascii:=utf16;
  utf7:=utf16;
  // Letters missing from the target character set become '?' or a similar
  // looking letter
  Assert (cyrillic='abc ?OA рыба');
  Assert (danish='abc ÆØÅ ????');
  Assert (greek='abc ?OA ????');
  Assert (greek='abc ?OA ????');   // Æ => ?   (same check as the line above)
  Assert (Cp437='abc ÆOÅ ????');   // Ø does not exist
  Assert (ascii='abc AOA ????');   // Æ => A
  // Every 8-bit character set uses one byte per character, replaced or not
  Assert (length(utf16)=12);
  Assert (length(local)=12);
  Assert (length(raw)=12);
  Assert (length(utf8)=19);        // 'abc ' = 4, ÆØÅ = 3*2, ' ' = 1, рыба = 4*2
  Assert (length(utf7)=28);
  Assert (length(Cp437)=12);
  Assert (length(cyrillic)=12);
  Assert (length(danish)=12);
  Assert (length(greek)=12);
  Assert (length(ascii)=12);

  // Converts to Unicode.
  // Characters that were replaced during the earlier conversion stay replaced.
  utf16:=danish;
  Assert (utf16='abc ÆØÅ ????');
  Assert (length(utf16)=12);
  utf16:=cyrillic;
  Assert (utf16='abc ?OA рыба');
  Assert (length(utf16)=12);
  utf16:=utf8;                     // utf-8 kept everything, so the round trip is lossless
  Assert (utf16='abc ÆØÅ рыба');
  Assert (length(utf16)=12);

  // The following lines only work correctly if your local character set
  // is Windows-1252!
  utf16:=raw;
  Assert (utf16='abc ÆØÅ ????');
  Assert (length(utf16)=12);

  // Assigning to ansistring converts to the local character set;
  // assigning to rawbytestring does not convert
  raw:=cyrillic;
  local:=cyrillic;
  Assert (local='abc ?OA ????');
  Assert (raw<>local);   // raw preserves cyrillic letters and the character set
  Assert (length(raw)=12);

  raw:=danish;
  local:=danish;
  Assert (raw=local);
  Assert (raw='abc ÆØÅ ????');
  Assert (local='abc ÆØÅ ????');
  Assert (length(raw)=12);

  raw:=greek;
  local:=greek;
  Assert (raw='abc ?OA ????');
  Assert (local='abc ?OA ????');
  Assert (raw=local); // This is only true because the string doesn't contain greek letters
  Assert (length(raw)=12);
end;

If you are in doubt about when to use ansistring and RawByteString, follow these guidelines:

* Use the normal (unicode) string type as much as you can.
* Use ansistring for texts in local 8-bit character sets. Usually it is only used for I/O.
* Use RawByteString for parameters to functions that have to work on all kinds of ansistrings, without triggering character set conversions, like I/O functions. This is really only necessary if you mix various character sets, which is rarely the case. Most programmers will only very rarely use RawByteString.
* Use RawByteString for storing binary data - but ansistring also works. Make sure that you never assign binary data to or from a UnicodeString (which is what the plain string type now is), because such an assignment triggers a character set conversion that changes the bytes. Note that most string manipulation functions now expect the unicode string type, so you may need to implement some things yourself.

If you want your code to work with both Delphi 2009 and earlier versions, you can insert this near the top of your source:

{$ifndef UNICODE}
// Only compiled when UNICODE is not defined, i.e. by compilers older than
// Delphi 2009: declare the two new type names as aliases of the closest
// existing types, so the same source compiles with both old and new versions.
type UnicodeString=widestring;   // the existing UTF-16 string type
type RawByteString=ansistring;   // plain 8-bit string
{$endif}

Use UnicodeString wherever you used widestring before, unless it's really widestring that you want to use (for BSTR compatibility). Program the rest using string wherever you can, and ansistring in some I/O operations. Most of the VCL already defaults to ansistring for non-Unicode I/O, making things very backwards compatible.

Compas Pascal

Wednesday, 8 October 2008

Delphi 2009 strings explained by example

1 comment: