From d4024c94f9ae55ebc466750678ca6bd6a47418a3 Mon Sep 17 00:00:00 2001 From: Philipp Dunkel Date: Wed, 25 Nov 2009 23:43:14 +0000 Subject: [PATCH] Added utf8_to_ascii for the new CAP form --- www/utf8_to_ascii/ChangeLog | 34 +++++++ www/utf8_to_ascii/LICENSE | 128 ++++++++++++++++++++++++ www/utf8_to_ascii/README | 40 ++++++++ www/utf8_to_ascii/utf8_to_ascii.php | 145 ++++++++++++++++++++++++++++ 4 files changed, 347 insertions(+) create mode 100644 www/utf8_to_ascii/ChangeLog create mode 100644 www/utf8_to_ascii/LICENSE create mode 100644 www/utf8_to_ascii/README create mode 100644 www/utf8_to_ascii/utf8_to_ascii.php diff --git a/www/utf8_to_ascii/ChangeLog b/www/utf8_to_ascii/ChangeLog new file mode 100644 index 0000000..1d3f356 --- /dev/null +++ b/www/utf8_to_ascii/ChangeLog @@ -0,0 +1,34 @@ +2006-10-25 17:12 harryf + + * README: More detail / notes + +2006-10-25 15:34 harryf + + * tests/index.php: Add simple unit test + +2006-10-25 15:34 harryf + + * utf8_to_ascii.php: Move from global to local static variable + +2006-09-04 23:34 harryf + + * builddb.pl: Fix example script name + +2006-09-04 23:34 harryf + + * utf8_to_ascii.php: Performance optimization / re-write - eliminate + string splitting with preg_match_all - most of the time was going + here - re-implement ord calculation - reduce #calls to PHP ord() + fn.
Improve ab average response time for the example from ~9s to + ~0.41s + +2006-03-04 00:43 harryf + + * LICENSE, README, builddb.pl, utf8_to_ascii.php, tests/index.php, + tests/data/utf8.txt: Initial import + +2006-03-04 00:43 harryf + + * LICENSE, README, builddb.pl, utf8_to_ascii.php, tests/index.php, + tests/data/utf8.txt: Initial revision + diff --git a/www/utf8_to_ascii/LICENSE b/www/utf8_to_ascii/LICENSE new file mode 100644 index 0000000..886394c --- /dev/null +++ b/www/utf8_to_ascii/LICENSE @@ -0,0 +1,128 @@ + The "Artistic License" + + Preamble + +The intent of this document is to state the conditions under which a +Package may be copied, such that the Copyright Holder maintains some +semblance of artistic control over the development of the package, +while giving the users of the package the right to use and distribute +the Package in a more-or-less customary fashion, plus the right to make +reasonable modifications. + +Definitions: + + "Package" refers to the collection of files distributed by the + Copyright Holder, and derivatives of that collection of files + created through textual modification. + + "Standard Version" refers to such a Package if it has not been + modified, or has been modified in accordance with the wishes + of the Copyright Holder as specified below. + + "Copyright Holder" is whoever is named in the copyright or + copyrights for the package. + + "You" is you, if you're thinking about copying or distributing + this Package. + + "Reasonable copying fee" is whatever you can justify on the + basis of media cost, duplication charges, time of people involved, + and so on. (You will not be required to justify it to the + Copyright Holder, but only to the computing community at large + as a market that must bear the fee.) + + "Freely Available" means that no fee is charged for the item + itself, though there may be fees involved in handling the item. 
+ It also means that recipients of the item may redistribute it + under the same conditions they received it. + +1. You may make and give away verbatim copies of the source form of the +Standard Version of this Package without restriction, provided that you +duplicate all of the original copyright notices and associated disclaimers. + +2. You may apply bug fixes, portability fixes and other modifications +derived from the Public Domain or from the Copyright Holder. A Package +modified in such a way shall still be considered the Standard Version. + +3. You may otherwise modify your copy of this Package in any way, provided +that you insert a prominent notice in each changed file stating how and +when you changed that file, and provided that you do at least ONE of the +following: + + a) place your modifications in the Public Domain or otherwise make them + Freely Available, such as by posting said modifications to Usenet or + an equivalent medium, or placing the modifications on a major archive + site such as uunet.uu.net, or by allowing the Copyright Holder to include + your modifications in the Standard Version of the Package. + + b) use the modified Package only within your corporation or organization. + + c) rename any non-standard executables so the names do not conflict + with standard executables, which must also be provided, and provide + a separate manual page for each non-standard executable that clearly + documents how it differs from the Standard Version. + + d) make other distribution arrangements with the Copyright Holder. + +4. You may distribute the programs of this Package in object code or +executable form, provided that you do at least ONE of the following: + + a) distribute a Standard Version of the executables and library files, + together with instructions (in the manual page or equivalent) on where + to get the Standard Version. + + b) accompany the distribution with the machine-readable source of + the Package with your modifications. 
+ + c) give non-standard executables non-standard names, and clearly + document the differences in manual pages (or equivalent), together + with instructions on where to get the Standard Version. + + d) make other distribution arrangements with the Copyright Holder. + +5. You may charge a reasonable copying fee for any distribution of this +Package. You may charge any fee you choose for support of this +Package. You may not charge a fee for this Package itself. However, +you may distribute this Package in aggregate with other (possibly +commercial) programs as part of a larger (possibly commercial) software +distribution provided that you do not advertise this Package as a +product of your own. You may embed this Package's interpreter within +an executable of yours (by linking); this shall be construed as a mere +form of aggregation, provided that the complete Standard Version of the +interpreter is so embedded. + +6. The scripts and library files supplied as input to or produced as +output from the programs of this Package do not automatically fall +under the copyright of this Package, but belong to whoever generated +them, and may be sold commercially, and may be aggregated with this +Package. If such scripts or library files are aggregated with this +Package via the so-called "undump" or "unexec" methods of producing a +binary executable image, then distribution of such an image shall +neither be construed as a distribution of this Package nor shall it +fall under the restrictions of Paragraphs 3 and 4, provided that you do +not represent such an executable image as a Standard Version of this +Package. + +7. 
C subroutines (or comparably compiled subroutines in other +languages) supplied by you and linked into this Package in order to +emulate subroutines and variables of the language defined by this +Package shall not be considered part of this Package, but are the +equivalent of input as in Paragraph 6, provided these subroutines do +not change the language in any way that would cause it to fail the +regression tests for the language. + +8. Aggregation of this Package with a commercial distribution is always +permitted provided that the use of this Package is embedded; that is, +when no overt attempt is made to make this Package's interfaces visible +to the end user of the commercial distribution. Such use shall not be +construed as a distribution of this Package. + +9. The name of the Copyright Holder may not be used to endorse or promote +products derived from this software without specific prior written permission. + +10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR +IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED +WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + + The End + diff --git a/www/utf8_to_ascii/README b/www/utf8_to_ascii/README new file mode 100644 index 0000000..df4f751 --- /dev/null +++ b/www/utf8_to_ascii/README @@ -0,0 +1,40 @@ +UTF8 TO ASCII + +US-ASCII transliterations of Unicode text + +A port of Sean M. Burke's Text::Unidecode Perl module + +http://search.cpan.org/~sburke/Text-Unidecode-0.04/ +http://interglacial.com/~sburke/ + +Use is simple; + + + +Some notes; + +- Make sure the string you provide is well-formed UTF-8! +http://phputf8.sourceforge.net/#UTF_8_Validation_and_Cleaning + +- For European languages, it should replace Unicode characters +with corresponding ASCII characters and produce a readable +result.
For other languages, the results will be less +meaningful - it's a "dumb" character-for-character replacement. +True transliteration is a little more complex than this; +See: http://en.wikipedia.org/wiki/Transliteration + +- For any characters for which there's no replacement +character available, a (default) '?' will be inserted. The second +argument can be used to define an alternative replacement char + +- Don't panic about all the files in the db subdirectory - they +are not all loaded at once - in fact they are only loaded if they +are needed to convert a given character (i.e. which files get +loaded depends on the input) + +For a little more see; +http://www.sitepoint.com/blogs/2006/03/03/us-ascii-transliterations-of-unicode-text/ diff --git a/www/utf8_to_ascii/utf8_to_ascii.php b/www/utf8_to_ascii/utf8_to_ascii.php new file mode 100644 index 0000000..243bdc3 --- /dev/null +++ b/www/utf8_to_ascii/utf8_to_ascii.php @@ -0,0 +1,145 @@ += 0 && $ord0 <= 127 ) { + + $ord = $ord0; + $increment = 1; + + } else { + + # 2 bytes + $ord1 = ord($str{$i+1}); + + if ( $ord0 >= 192 && $ord0 <= 223 ) { + + $ord = ( $ord0 - 192 ) * 64 + ( $ord1 - 128 ); + $increment = 2; + + } else { + + # 3 bytes + $ord2 = ord($str{$i+2}); + + if ( $ord0 >= 224 && $ord0 <= 239 ) { + + $ord = ($ord0-224)*4096 + ($ord1-128)*64 + ($ord2-128); + $increment = 3; + + } else { + + # 4 bytes + $ord3 = ord($str{$i+3}); + + if ($ord0>=240 && $ord0<=247) { + + $ord = ($ord0-240)*262144 + ($ord1-128)*4096 + + ($ord2-128)*64 + ($ord3-128); + $increment = 4; + + } else { + + ob_end_clean(); + trigger_error("utf8_to_ascii: looks like badly formed UTF-8 at byte $i"); + return FALSE; + + } + + } + + } + + } + + $bank = $ord >> 8; + + # If we haven't used anything from this bank before, need to load it... + if ( !array_key_exists($bank, $UTF8_TO_ASCII) ) { + + $bankfile = UTF8_TO_ASCII_DB. '/'.
sprintf("x%02x",$bank).'.php'; + + if ( file_exists($bankfile) ) { + + # Load the appropriate database + if ( !include $bankfile ) { + ob_end_clean(); + trigger_error("utf8_to_ascii: unable to load $bankfile"); + } + + } else { + + # Some banks are deliberately empty + $UTF8_TO_ASCII[$bank] = array(); + + } + } + + $newchar = $ord & 255; + + if ( array_key_exists($newchar, $UTF8_TO_ASCII[$bank]) ) { + echo $UTF8_TO_ASCII[$bank][$newchar]; + } else { + echo $unknown; + } + + $i += $increment; + + } + + $str = ob_get_contents(); + ob_end_clean(); + return $str; + +}