Talk:Western Latin character sets (computing)

From Wikipedia, the free encyclopedia

[edit] code to generate table

This code was written using Delphi but should also build and run with Free Pascal. readtxt.pas can be obtained from the bewareserv source. The mappings come from the following locations.

As raw statements of fact about encodings that a huge amount of software uses, I do not believe the raw mappings from these can be considered eligible for copyright.

program charsetcomparisongen;

uses
  sysutils,readtxt; //we use our own text reader as the delphi one can't handle
                    //unix format text
const
  maxcharset =5; //highest charset number; charsets are numbered 0..maxcharset
var
  //buildarray[codepoint,charset] is the byte that charset uses for that
  //16 bit codepoint. the main program fills every entry with -1 before
  //loading the mapping files, so -1 means "no mapping was read".
  buildarray : array[0..65535,0..maxcharset] of smallint;
  //names[charset] is the text written inside [[ ]] in the column heading
  names : array[0..31] of string;
//reads one charset mapping file and records it in the global tables.
//  name     - stored in names[number]; the main program writes it inside
//             [[ ]] as the column heading
//  number   - column index of this charset, must be in 0..maxcharset
//  filename - mapping file to read. only lines at least 11 characters long
//             with an 'x' at positions 2 and 7 are used: characters 3-4 are
//             taken as the hex byte value and characters 8-11 as the hex
//             codepoint. every other line is ignored.
//strtoint will raise if a line that passes that test holds non hex digits.
procedure processcharset(name:string;number:byte;filename:string);
var
  t: treadtext;
  line:string;
begin
  //buildarray only has columns 0..maxcharset; without this check a larger
  //number would write outside the array when range checking is off.
  //201 is the standard range check runtime error.
  if number>maxcharset then runerror(201);

  names[number] := name;

  readtext_init(t,filename);
  repeat
    line := readtext_line(t);
    if (length(line)>=11) and (line[2]='x') and (line[7]='x') then begin
      //writeln('processing line '+line);
      buildarray[strtoint('$'+copy(line,8,4)),number] := strtoint('$'+copy(line,3,2));
    end;
  until readtext_eof(t);
  //NOTE(review): the reader is never released here; check whether readtxt
  //provides a cleanup call that should follow the loop.
end;
//main program: loads every charset mapping, then writes output.txt, an html
//table with one row for each codepoint from U+0080 up that at least one of
//the charsets maps. each row shows the character, its codepoint and, per
//charset, either the byte value (green cell) or an empty red cell.
var
  t: textfile;
  i,j : integer;
  goodline : boolean;
  rowcounter : integer;
begin
  //-1 marks "no mapping"; processcharset overwrites the entries it finds
  for i := 0 to 65535 do for j := 0 to maxcharset do buildarray[i,j] := -1;
  processcharset('ISO-8859-1',0,'8859-1.txt');
  processcharset('ISO-8859-15',1,'8859-15.txt');
  processcharset('WINDOWS-1252',2,'CP1252.txt');
  processcharset('IBM437',3,'CP437.txt');
  processcharset('IBM850',4,'CP850.txt');
  processcharset('Mac-Roman encoding|MACINTOSH',5,'ROMAN.txt');

  assignfile(t,'output.txt');
  rewrite(t);
  writeln(t,'<table {{subst:prettytable}}>');

  rowcounter := 0;
  for i := $80 to 65535 do begin
    //a codepoint gets a row only if at least one charset maps it
    goodline := false;
    for j := 0 to maxcharset do begin
      if buildarray[i,j]<>-1 then goodline := true;
    end;
    if goodline then begin
      //repeat the heading row before every 16th data row
      if (rowcounter and $F) = 0 then begin
        write(t,'<tr><td>character<td>Codepoint');
        for j := 0 to maxcharset do begin
          write(t,'<td>[['+names[j]+']]');
        end;
        //close the heading row so the next <tr> does not open inside it
        writeln(t,'</tr>');
      end;
      inc(rowcounter);
      write(t,'<tr><td>');
      //control codes and the two invisible characters are shown by name,
      //everything else as a numeric character reference.
      //the $00..$1F entries are not reached while the loop starts at $80;
      //they are kept so the start of the range can be lowered.
      case i of
        $00 : write(t,'[[NUL]]');
        $01 : write(t,'[[SOH]]');
        $02 : write(t,'[[STX]]');
        $03 : write(t,'[[ETX]]');
        $04 : write(t,'[[EOT]]');
        $05 : write(t,'[[ENQ]]');
        $06 : write(t,'[[ACK]]');
        $07 : write(t,'[[BEL]]');
        $08 : write(t,'[[BS]]');
        $09 : write(t,'[[TAB]]');
        $0A : write(t,'[[LF]]');
        $0B : write(t,'[[VT]]');
        $0C : write(t,'[[FF]]');
        $0D : write(t,'[[CR]]');
        $0E : write(t,'[[SO]]');
        $0F : write(t,'[[SI]]');

        $10 : write(t,'[[DLE]]');
        $11 : write(t,'[[DC1]]');
        $12 : write(t,'[[DC2]]');
        $13 : write(t,'[[DC3]]');
        $14 : write(t,'[[DC4]]');
        $15 : write(t,'[[NAK]]');
        $16 : write(t,'[[SYN]]');
        $17 : write(t,'[[ETB]]');
        $18 : write(t,'[[CAN]]');
        $19 : write(t,'[[EM]]');
        $1A : write(t,'[[SUB]]');
        $1B : write(t,'[[ESC]]');
        $1C : write(t,'[[FS]]');
        $1D : write(t,'[[GS]]');
        $1E : write(t,'[[RS]]');
        $1F : write(t,'[[US]]');

        $80 : write(t,'[[PAD]]');
        $81 : write(t,'[[HOP]]');
        $82 : write(t,'[[BPH]]');
        $83 : write(t,'[[NBH]]');
        $84 : write(t,'[[IND]]');
        $85 : write(t,'[[NEL]]');
        $86 : write(t,'[[SSA]]');
        $87 : write(t,'[[ESA]]');
        $88 : write(t,'[[HTS]]');
        $89 : write(t,'[[HTJ]]');
        $8A : write(t,'[[VTS]]');
        $8B : write(t,'[[PLD]]');
        $8C : write(t,'[[PLU]]');
        $8D : write(t,'[[RI]]');
        $8E : write(t,'[[SS2]]');
        $8F : write(t,'[[SS3]]');

        $90 : write(t,'[[DCS]]');
        $91 : write(t,'[[PU1]]');
        $92 : write(t,'[[PU2]]');
        $93 : write(t,'[[STS]]');
        $94 : write(t,'[[CCH]]');
        $95 : write(t,'[[MW]]');
        $96 : write(t,'[[SPA]]');
        $97 : write(t,'[[EPA]]');
        $98 : write(t,'[[SOS]]');
        $99 : write(t,'[[SGCI]]');
        $9A : write(t,'[[SCI]]');
        $9B : write(t,'[[CSI]]');
        $9C : write(t,'[[ST]]');
        $9D : write(t,'[[OSC]]');
        $9E : write(t,'[[PM]]');
        $9F : write(t,'[[APC]]');

        $A0 : write(t,'[[NBSP]]');
        $AD : write(t,'[[SHY]]');


        else write(t,'[[&#x'+inttohex(i,4)+';]]');
      end;
      write(t,'<td>U+'+inttohex(i,4));
      //one cell per charset: red and blank if unmapped, green with the
      //two digit hex byte value otherwise
      for j := 0 to maxcharset do begin
        if buildarray[i,j]=-1 then begin
          write(t,'<td bgcolor=red> ')
        end else begin
          write(t,'<td bgcolor=lightgreen>'+inttohex(buildarray[i,j],2));
        end;
      end;
      writeln(t,'</tr>');


    end;
  end;
  writeln(t,'</table>');
  closefile(t);

end.

[edit] Language represented

Gleaned from Czyborra.com. Can anyone fill in the blanks? Michael Z. 2005-07-8 20:25 Z

ASCII Latin, Swahili, Hawaiian and American English
ISO-8859-1 French (fr), Spanish (es), Catalan (ca), Basque (eu), Portuguese (pt), Italian (it), Albanian (sq), Rhaeto-Romanic (rm), Dutch (nl), German (de), Danish (da), Swedish (sv), Norwegian (no), Finnish (fi), Faroese (fo), Icelandic (is), Irish (ga), Scottish (gd), and English (en), incidentally also Afrikaans (af) and Swahili (with some tolerable omissions for Dutch, French, and German).
ISO-8859-15 ISO-8859-1 plus some forgotten French and Finnish letters.
WINDOWS-1252 ISO-8859-1 plus [?]
IBM437
IBM850 Presents Latin-1 repertoire in code positions compatible with IBM437's line-drawing characters.
MACINTOSH "This character set is used for at least the following Mac OS localizations: U.S., British, Canadian French, French, Swiss French, German, Swiss German, Italian, Swiss Italian, Dutch, Swedish, Norwegian, Danish, Finnish, Spanish, Catalan, Portuguese, Brazilian, and the default International system."


OK, I've put a modified version of that table below.
ASCII Latin, Swahili, Hawaiian and English
ISO-8859-1 French (fr), Spanish (es), Catalan (ca), Basque (eu), Portuguese (pt), Italian (it), Albanian (sq), Rhaeto-Romanic (rm), Dutch (nl), German (de), Danish (da), Swedish (sv), Norwegian (no), Finnish (fi), Faroese (fo), Icelandic (is), Irish (ga), Scottish (gd), and English (en), incidentally also Afrikaans (af) and Swahili (with some tolerable omissions for Dutch, French, and German).
ISO-8859-15 ISO-8859-1 plus some forgotten French and Finnish letters and the euro sign, minus vulgar fractions, the generic currency sign and some letter-free diacritics.
WINDOWS-1252 ISO-8859-1 minus the rarely used C1 control codes, plus everything from ISO-8859-15 (in different positions), curved quotes, various symbols and proper dashes.
IBM437 ASCII plus a lot of graphics characters, some letters with diacritics (but nowhere near as many as the other encodings here) and some basic Greek letters.
IBM850 All printable characters from ISO-8859-1 plus some graphics characters and some other bits and pieces in code positions such that characters shared with IBM437 are in the same place as in IBM437.
MACINTOSH "This character set is used for at least the following Mac OS localizations: U.S., British, Canadian French, French, Swiss French, German, Swiss German, Italian, Swiss Italian, Dutch, Swedish, Norwegian, Danish, Finnish, Spanish, Catalan, Portuguese, Brazilian, and the default International system."

[edit] Representation of Western European languages

Please don't edit the introduction to make it sound like ASCII is sufficient to represent non-English languages, and language-specific letters are a bonus. Either an encoding has the letters used in a language, or it doesn't. Imagine if you had to write English without the letters C, Q, and W, and had to substitute S, K, and "UU" for them—you would kuikly deside that this sukked, and undoubtedly kreate your own English-language enkoding by the end of the uueek. Michael Z. 2005-07-9 07:19 Z