Added utf8_to_ascii for the new CAP form
This commit is contained in:
parent
d74d29313e
commit
d4024c94f9
4 changed files with 347 additions and 0 deletions
34
www/utf8_to_ascii/ChangeLog
Normal file
34
www/utf8_to_ascii/ChangeLog
Normal file
|
@ -0,0 +1,34 @@
|
|||
2006-10-25 17:12 harryf
|
||||
|
||||
* README: More detail / notes
|
||||
|
||||
2006-10-25 15:34 harryf
|
||||
|
||||
* tests/index.php: Add simple unit test
|
||||
|
||||
2006-10-25 15:34 harryf
|
||||
|
||||
* utf8_to_ascii.php: Move from global to local static variable
|
||||
|
||||
2006-09-04 23:34 harryf
|
||||
|
||||
* builddb.pl: Fix example script name
|
||||
|
||||
2006-09-04 23:34 harryf
|
||||
|
||||
* utf8_to_ascii.php: Performance optimization / re-write - eliminate
|
||||
string splitting with preg_match_all - most of the time was going
|
||||
here - re-implement ord calculation - reduce #calls to PHP ord()
|
||||
fn. Improve ab average response time for the example from ~9s to
|
||||
~0.41s
|
||||
|
||||
2006-03-04 00:43 harryf
|
||||
|
||||
* LICENSE, README, builddb.pl, utf8_to_ascii.php, tests/index.php,
|
||||
tests/data/utf8.txt: Initial import
|
||||
|
||||
2006-03-04 00:43 harryf
|
||||
|
||||
* LICENSE, README, builddb.pl, utf8_to_ascii.php, tests/index.php,
|
||||
tests/data/utf8.txt: Initial revision
|
||||
|
128
www/utf8_to_ascii/LICENSE
Normal file
128
www/utf8_to_ascii/LICENSE
Normal file
|
@ -0,0 +1,128 @@
|
|||
The "Artistic License"
|
||||
|
||||
Preamble
|
||||
|
||||
The intent of this document is to state the conditions under which a
|
||||
Package may be copied, such that the Copyright Holder maintains some
|
||||
semblance of artistic control over the development of the package,
|
||||
while giving the users of the package the right to use and distribute
|
||||
the Package in a more-or-less customary fashion, plus the right to make
|
||||
reasonable modifications.
|
||||
|
||||
Definitions:
|
||||
|
||||
"Package" refers to the collection of files distributed by the
|
||||
Copyright Holder, and derivatives of that collection of files
|
||||
created through textual modification.
|
||||
|
||||
"Standard Version" refers to such a Package if it has not been
|
||||
modified, or has been modified in accordance with the wishes
|
||||
of the Copyright Holder as specified below.
|
||||
|
||||
"Copyright Holder" is whoever is named in the copyright or
|
||||
copyrights for the package.
|
||||
|
||||
"You" is you, if you're thinking about copying or distributing
|
||||
this Package.
|
||||
|
||||
"Reasonable copying fee" is whatever you can justify on the
|
||||
basis of media cost, duplication charges, time of people involved,
|
||||
and so on. (You will not be required to justify it to the
|
||||
Copyright Holder, but only to the computing community at large
|
||||
as a market that must bear the fee.)
|
||||
|
||||
"Freely Available" means that no fee is charged for the item
|
||||
itself, though there may be fees involved in handling the item.
|
||||
It also means that recipients of the item may redistribute it
|
||||
under the same conditions they received it.
|
||||
|
||||
1. You may make and give away verbatim copies of the source form of the
|
||||
Standard Version of this Package without restriction, provided that you
|
||||
duplicate all of the original copyright notices and associated disclaimers.
|
||||
|
||||
2. You may apply bug fixes, portability fixes and other modifications
|
||||
derived from the Public Domain or from the Copyright Holder. A Package
|
||||
modified in such a way shall still be considered the Standard Version.
|
||||
|
||||
3. You may otherwise modify your copy of this Package in any way, provided
|
||||
that you insert a prominent notice in each changed file stating how and
|
||||
when you changed that file, and provided that you do at least ONE of the
|
||||
following:
|
||||
|
||||
a) place your modifications in the Public Domain or otherwise make them
|
||||
Freely Available, such as by posting said modifications to Usenet or
|
||||
an equivalent medium, or placing the modifications on a major archive
|
||||
site such as uunet.uu.net, or by allowing the Copyright Holder to include
|
||||
your modifications in the Standard Version of the Package.
|
||||
|
||||
b) use the modified Package only within your corporation or organization.
|
||||
|
||||
c) rename any non-standard executables so the names do not conflict
|
||||
with standard executables, which must also be provided, and provide
|
||||
a separate manual page for each non-standard executable that clearly
|
||||
documents how it differs from the Standard Version.
|
||||
|
||||
d) make other distribution arrangements with the Copyright Holder.
|
||||
|
||||
4. You may distribute the programs of this Package in object code or
|
||||
executable form, provided that you do at least ONE of the following:
|
||||
|
||||
a) distribute a Standard Version of the executables and library files,
|
||||
together with instructions (in the manual page or equivalent) on where
|
||||
to get the Standard Version.
|
||||
|
||||
b) accompany the distribution with the machine-readable source of
|
||||
the Package with your modifications.
|
||||
|
||||
c) give non-standard executables non-standard names, and clearly
|
||||
document the differences in manual pages (or equivalent), together
|
||||
with instructions on where to get the Standard Version.
|
||||
|
||||
d) make other distribution arrangements with the Copyright Holder.
|
||||
|
||||
5. You may charge a reasonable copying fee for any distribution of this
|
||||
Package. You may charge any fee you choose for support of this
|
||||
Package. You may not charge a fee for this Package itself. However,
|
||||
you may distribute this Package in aggregate with other (possibly
|
||||
commercial) programs as part of a larger (possibly commercial) software
|
||||
distribution provided that you do not advertise this Package as a
|
||||
product of your own. You may embed this Package's interpreter within
|
||||
an executable of yours (by linking); this shall be construed as a mere
|
||||
form of aggregation, provided that the complete Standard Version of the
|
||||
interpreter is so embedded.
|
||||
|
||||
6. The scripts and library files supplied as input to or produced as
|
||||
output from the programs of this Package do not automatically fall
|
||||
under the copyright of this Package, but belong to whoever generated
|
||||
them, and may be sold commercially, and may be aggregated with this
|
||||
Package. If such scripts or library files are aggregated with this
|
||||
Package via the so-called "undump" or "unexec" methods of producing a
|
||||
binary executable image, then distribution of such an image shall
|
||||
neither be construed as a distribution of this Package nor shall it
|
||||
fall under the restrictions of Paragraphs 3 and 4, provided that you do
|
||||
not represent such an executable image as a Standard Version of this
|
||||
Package.
|
||||
|
||||
7. C subroutines (or comparably compiled subroutines in other
|
||||
languages) supplied by you and linked into this Package in order to
|
||||
emulate subroutines and variables of the language defined by this
|
||||
Package shall not be considered part of this Package, but are the
|
||||
equivalent of input as in Paragraph 6, provided these subroutines do
|
||||
not change the language in any way that would cause it to fail the
|
||||
regression tests for the language.
|
||||
|
||||
8. Aggregation of this Package with a commercial distribution is always
|
||||
permitted provided that the use of this Package is embedded; that is,
|
||||
when no overt attempt is made to make this Package's interfaces visible
|
||||
to the end user of the commercial distribution. Such use shall not be
|
||||
construed as a distribution of this Package.
|
||||
|
||||
9. The name of the Copyright Holder may not be used to endorse or promote
|
||||
products derived from this software without specific prior written permission.
|
||||
|
||||
10. THIS PACKAGE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR
|
||||
IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
|
||||
|
||||
The End
|
||||
|
40
www/utf8_to_ascii/README
Normal file
40
www/utf8_to_ascii/README
Normal file
|
@ -0,0 +1,40 @@
|
|||
UTF8 TO ASCII
|
||||
|
||||
US-ASCII transliterations of Unicode text
|
||||
|
||||
Ported from Sean M. Burke's Text::Unidecode Perl module
|
||||
|
||||
http://search.cpan.org/~sburke/Text-Unidecode-0.04/
|
||||
http://interglacial.com/~sburke/
|
||||
|
||||
Use is simple;
|
||||
|
||||
<?php
|
||||
require_once '/path/to/utf8_to_ascii/utf8_to_ascii.php';
|
||||
$utf8 = file_get_contents('/tmp/someutf8.txt');
|
||||
$ascii = utf8_to_ascii($utf8);
|
||||
?>
|
||||
|
||||
Some notes;
|
||||
|
||||
- Make sure the input you provide is well-formed UTF-8!
|
||||
http://phputf8.sourceforge.net/#UTF_8_Validation_and_Cleaning
|
||||
|
||||
- For European languages, it should replace Unicode characters
|
||||
with corresponding ascii characters and produce a readable
|
||||
result. For other languages, the results will be less
|
||||
meaningful - it's a "dumb" character for character replacement
|
||||
True transliteration is a little more complex than this;
|
||||
See: http://en.wikipedia.org/wiki/Transliteration
|
||||
|
||||
- For any characters for which there's no replacement
|
||||
character available, a (default) '?' will be inserted. The second
|
||||
argument can be used to define an alternative replacement char
|
||||
|
||||
- Don't panic about all the files in the db subdirectory - they
|
||||
are not all loaded at once - in fact they are only loaded if they
|
||||
are needed to convert a given character (i.e. which files get
|
||||
loaded depends on the input)
|
||||
|
||||
For a little more see;
|
||||
http://www.sitepoint.com/blogs/2006/03/03/us-ascii-transliterations-of-unicode-text/
|
145
www/utf8_to_ascii/utf8_to_ascii.php
Normal file
145
www/utf8_to_ascii/utf8_to_ascii.php
Normal file
|
@ -0,0 +1,145 @@
|
|||
<?php
|
||||
/**
|
||||
* US-ASCII transliterations of Unicode text
|
||||
* @version $Id: utf8_to_ascii.php,v 1.1 2009/11/25 23:43:14 philipp Exp $
|
||||
* @package utf8_to_ascii
|
||||
*/
|
||||
|
||||
# Directory holding the transliteration bank files. Only defined here when the
# including script has not already defined it, so callers can override the path.
defined('UTF8_TO_ASCII_DB') || define('UTF8_TO_ASCII_DB', dirname(__FILE__) . '/db');
|
||||
|
||||
//--------------------------------------------------------------------
|
||||
/**
 * US-ASCII transliterations of Unicode text
 * Ported Sean M. Burke's Text::Unidecode Perl module (He did all the hard work!)
 * Warning: you should only pass this well formed UTF-8! Only the lead byte and the
 * length of each sequence are checked here; continuation bytes are not validated.
 * Be aware it works by making a copy of the input string which it appends transliterated
 * characters to - it uses a PHP output buffer to do this - it means, memory use will increase,
 * requiring up to the same amount again as the input string
 * @see http://search.cpan.org/~sburke/Text-Unidecode-0.04/lib/Text/Unidecode.pm
 * @param string UTF-8 string to convert
 * @param string (default = ?) Character to use if the character is unknown
 * @return string|false US-ASCII string; FALSE (after trigger_error) if an invalid lead
 *         byte or a truncated multi-byte sequence is found, or a bank file fails to load
 * @package utf8_to_ascii
 */
function utf8_to_ascii($str, $unknown = '?') {

    # The database for transliteration stored here, kept between calls.
    # NOTE: the name must stay $UTF8_TO_ASCII - the bank files are included in this
    # function's scope and are presumably written to assign into $UTF8_TO_ASCII[$bank].
    static $UTF8_TO_ASCII = array();

    # Variable lookups faster than accessing constants
    $UTF8_TO_ASCII_DB = UTF8_TO_ASCII_DB;

    $len = strlen($str);
    if ( $len == 0 ) { return ''; }

    $i = 0;

    # Use an output buffer to collect the transliterated string.
    # This was done for performance vs. string concatenation.
    # See http://phplens.com/lens/php-book/optimizing-debugging-php.php
    # Section "High Return Code Optimizations"
    # Every exit path below must close this buffer exactly once.
    ob_start();

    while ( $i < $len ) {

        # $str[$i] rather than $str{$i}: the curly brace offset syntax was removed in PHP 8
        $ord0 = ord($str[$i]);

        # Sequence length is decided by the lead byte; 0 marks an invalid lead byte
        # (a stray continuation byte 128-191, or 248-255)
        if ( $ord0 <= 127 ) {
            $increment = 1;
        } elseif ( $ord0 >= 192 && $ord0 <= 223 ) {
            $increment = 2;
        } elseif ( $ord0 >= 224 && $ord0 <= 239 ) {
            $increment = 3;
        } elseif ( $ord0 >= 240 && $ord0 <= 247 ) {
            $increment = 4;
        } else {
            $increment = 0;
        }

        # Reject invalid lead bytes, and sequences cut off by the end of the string
        # (which would otherwise read offsets past the end of $str)
        if ( $increment == 0 || $i + $increment > $len ) {
            ob_end_clean();
            trigger_error("utf8_to_ascii: looks like badly formed UTF-8 at byte $i");
            return FALSE;
        }

        # Decode the code point from the lead byte and its continuation bytes
        switch ( $increment ) {

            # 1 byte - ASCII
            case 1:
                $ord = $ord0;
                break;

            # 2 bytes
            case 2:
                $ord = ( $ord0 - 192 ) * 64
                     + ( ord($str[$i+1]) - 128 );
                break;

            # 3 bytes
            case 3:
                $ord = ( $ord0 - 224 ) * 4096
                     + ( ord($str[$i+1]) - 128 ) * 64
                     + ( ord($str[$i+2]) - 128 );
                break;

            # 4 bytes
            default:
                $ord = ( $ord0 - 240 ) * 262144
                     + ( ord($str[$i+1]) - 128 ) * 4096
                     + ( ord($str[$i+2]) - 128 ) * 64
                     + ( ord($str[$i+3]) - 128 );
                break;
        }

        # High bits select the bank file, low 8 bits the entry within the bank
        $bank = $ord >> 8;

        # If we haven't used anything from this bank before, need to load it...
        if ( !array_key_exists($bank, $UTF8_TO_ASCII) ) {

            $bankfile = $UTF8_TO_ASCII_DB. '/'. sprintf("x%02x",$bank).'.php';

            if ( file_exists($bankfile) ) {

                # Load the appropriate database
                if ( !(include $bankfile) ) {
                    # Abort here: the buffer is closed, so carrying on would
                    # close it a second time and read a bank that was never set
                    ob_end_clean();
                    trigger_error("utf8_to_ascii: unable to load $bankfile");
                    return FALSE;
                }

                # Guard against a bank file that did not populate its slot
                if ( !isset($UTF8_TO_ASCII[$bank]) || !is_array($UTF8_TO_ASCII[$bank]) ) {
                    $UTF8_TO_ASCII[$bank] = array();
                }

            } else {

                # Some banks are deliberately empty
                $UTF8_TO_ASCII[$bank] = array();

            }
        }

        $newchar = $ord & 255;

        if ( array_key_exists($newchar, $UTF8_TO_ASCII[$bank]) ) {
            echo $UTF8_TO_ASCII[$bank][$newchar];
        } else {
            echo $unknown;
        }

        $i += $increment;

    }

    $str = ob_get_contents();
    ob_end_clean();
    return $str;

}
|
Loading…
Reference in a new issue