| CODENOTIFIER | Help | You are not signed in | Sign in |
Project: Hpricot
Revision: 165
Author: stepheneb
Date: 13 Apr 2008 01:31:43
Diff at Trac: http://code.whytheluckystiff.net/hpricot/changeset/165
Changes: adding forgotten file: ext/fast_xs/FastXsService.java
Files:| ... | ...@@ -0,0 +1,1018 @@ | |
| 1 | ||
| 2 | import java.io.IOException; | |
| 3 | import java.io.StringWriter; | |
| 4 | import java.io.Writer; | |
| 5 | import java.util.HashMap; | |
| 6 | import java.util.Map; | |
| 7 | import java.util.TreeMap; | |
| 8 | import org.jruby.Ruby; | |
| 9 | import org.jruby.RubyModule; | |
| 10 | import org.jruby.runtime.CallbackFactory; | |
| 11 | import org.jruby.runtime.builtin.IRubyObject; | |
| 12 | import org.jruby.runtime.load.BasicLibraryService; | |
| 13 | import org.jruby.util.collections.IntHashMap; | |
| 14 | ||
| 15 | public class FastXsService implements BasicLibraryService { | |
| 16 | ||
| 17 | public boolean basicLoad(final Ruby runtime) throws IOException { | |
| 18 | RubyModule string = runtime.getModule("String"); | |
| 19 | CallbackFactory fact = runtime.callbackFactory(FastXsService.class); | |
| 20 | string.defineMethod("fast_xs",fact.getFastSingletonMethod("fast_xs")); | |
| 21 | return true; | |
| 22 | } | |
| 23 | ||
| 24 | public static IRubyObject fast_xs(IRubyObject recv) { | |
| 25 | String string = recv.convertToString().getUnicodeValue(); | |
| 26 | StringWriter writer = new StringWriter ((int)(string.length() * 1.5)); | |
| 27 | try { | |
| 28 | Entities.HTML40.escape(writer, string); | |
| 29 | return recv.getRuntime().newString(writer.toString()); | |
| 30 | } catch (IOException e) { | |
| 31 | throw recv.getRuntime().newIOErrorFromException(e); | |
| 32 | } | |
| 33 | } | |
| 34 | } | |
| 35 | ||
| 36 | // From Apache commons-lang, | |
| 37 | // http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java?revision=560660&view=markup | |
| 38 | /* | |
| 39 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 40 | * contributor license agreements. See the NOTICE file distributed with | |
| 41 | * this work for additional information regarding copyright ownership. | |
| 42 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 43 | * (the "License"); you may not use this file except in compliance with | |
| 44 | * the License. You may obtain a copy of the License at | |
| 45 | * | |
| 46 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 47 | * | |
| 48 | * Unless required by applicable law or agreed to in writing, software | |
| 49 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 50 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 51 | * See the License for the specific language governing permissions and | |
| 52 | * limitations under the License. | |
| 53 | */ | |
| 54 | ||
| 55 | /** | |
| 56 | * <p> | |
| 57 | * Provides HTML and XML entity utilities. | |
| 58 | * </p> | |
| 59 | * | |
| 60 | * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a> | |
| 61 | * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a> | |
| 62 | * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a> | |
| 63 | * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a> | |
| 64 | * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a> | |
| 65 | * | |
| 66 | * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a> | |
| 67 | * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a> | |
| 68 | * @since 2.0 | |
| 69 | * @version $Id$ | |
| 70 | */ | |
| 71 | class Entities { | |
| 72 | ||
// Core markup entities. Each row is {entity name, decimal character code as a string}.
// The static initializers in this file add this table to the XML, HTML32 and HTML40 sets.
private static final String[][] BASIC_ARRAY = {{"quot", "34"}, // " - double-quote
    {"amp", "38"}, // & - ampersand
    {"lt", "60"}, // < - less-than
    {"gt", "62"}, // > - greater-than
};
| 78 | ||
// The apostrophe entity, as {entity name, decimal character code as a string}.
// In this file it is added only to the XML entity set, not to the HTML sets.
private static final String[][] APOS_ARRAY = {{"apos", "39"}, // XML apostrophe
};
| 81 | ||
// Named entities for character codes 160-255, one row per code, as
// {entity name, decimal character code as a string}.
// Added to the HTML32 set and, through fillWithHtml40Entities, to the HTML40 set.
// package scoped for testing
static final String[][] ISO8859_1_ARRAY = {{"nbsp", "160"}, // non-breaking space
    {"iexcl", "161"}, // inverted exclamation mark
    {"cent", "162"}, // cent sign
    {"pound", "163"}, // pound sign
    {"curren", "164"}, // currency sign
    {"yen", "165"}, // yen sign = yuan sign
    {"brvbar", "166"}, // broken bar = broken vertical bar
    {"sect", "167"}, // section sign
    {"uml", "168"}, // diaeresis = spacing diaeresis
    {"copy", "169"}, // © - copyright sign
    {"ordf", "170"}, // feminine ordinal indicator
    {"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
    {"not", "172"}, // not sign
    {"shy", "173"}, // soft hyphen = discretionary hyphen
    {"reg", "174"}, // ® - registered trademark sign
    {"macr", "175"}, // macron = spacing macron = overline = APL overbar
    {"deg", "176"}, // degree sign
    {"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
    {"sup2", "178"}, // superscript two = superscript digit two = squared
    {"sup3", "179"}, // superscript three = superscript digit three = cubed
    {"acute", "180"}, // acute accent = spacing acute
    {"micro", "181"}, // micro sign
    {"para", "182"}, // pilcrow sign = paragraph sign
    {"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
    {"cedil", "184"}, // cedilla = spacing cedilla
    {"sup1", "185"}, // superscript one = superscript digit one
    {"ordm", "186"}, // masculine ordinal indicator
    {"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
    {"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
    {"frac12", "189"}, // vulgar fraction one half = fraction one half
    {"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
    {"iquest", "191"}, // inverted question mark = turned question mark
    {"Agrave", "192"}, // À - uppercase A, grave accent
    {"Aacute", "193"}, // Á - uppercase A, acute accent
    {"Acirc", "194"}, // Â - uppercase A, circumflex accent
    {"Atilde", "195"}, // Ã - uppercase A, tilde
    {"Auml", "196"}, // Ä - uppercase A, umlaut
    {"Aring", "197"}, // Å - uppercase A, ring
    {"AElig", "198"}, // Æ - uppercase AE
    {"Ccedil", "199"}, // Ç - uppercase C, cedilla
    {"Egrave", "200"}, // È - uppercase E, grave accent
    {"Eacute", "201"}, // É - uppercase E, acute accent
    {"Ecirc", "202"}, // Ê - uppercase E, circumflex accent
    {"Euml", "203"}, // Ë - uppercase E, umlaut
    {"Igrave", "204"}, // Ì - uppercase I, grave accent
    {"Iacute", "205"}, // Í - uppercase I, acute accent
    {"Icirc", "206"}, // Î - uppercase I, circumflex accent
    {"Iuml", "207"}, // Ï - uppercase I, umlaut
    {"ETH", "208"}, // Ð - uppercase Eth, Icelandic
    {"Ntilde", "209"}, // Ñ - uppercase N, tilde
    {"Ograve", "210"}, // Ò - uppercase O, grave accent
    {"Oacute", "211"}, // Ó - uppercase O, acute accent
    {"Ocirc", "212"}, // Ô - uppercase O, circumflex accent
    {"Otilde", "213"}, // Õ - uppercase O, tilde
    {"Ouml", "214"}, // Ö - uppercase O, umlaut
    {"times", "215"}, // multiplication sign
    {"Oslash", "216"}, // Ø - uppercase O, slash
    {"Ugrave", "217"}, // Ù - uppercase U, grave accent
    {"Uacute", "218"}, // Ú - uppercase U, acute accent
    {"Ucirc", "219"}, // Û - uppercase U, circumflex accent
    {"Uuml", "220"}, // Ü - uppercase U, umlaut
    {"Yacute", "221"}, // Ý - uppercase Y, acute accent
    {"THORN", "222"}, // Þ - uppercase THORN, Icelandic
    {"szlig", "223"}, // ß - lowercase sharp s, German
    {"agrave", "224"}, // à - lowercase a, grave accent
    {"aacute", "225"}, // á - lowercase a, acute accent
    {"acirc", "226"}, // â - lowercase a, circumflex accent
    {"atilde", "227"}, // ã - lowercase a, tilde
    {"auml", "228"}, // ä - lowercase a, umlaut
    {"aring", "229"}, // å - lowercase a, ring
    {"aelig", "230"}, // æ - lowercase ae
    {"ccedil", "231"}, // ç - lowercase c, cedilla
    {"egrave", "232"}, // è - lowercase e, grave accent
    {"eacute", "233"}, // é - lowercase e, acute accent
    {"ecirc", "234"}, // ê - lowercase e, circumflex accent
    {"euml", "235"}, // ë - lowercase e, umlaut
    {"igrave", "236"}, // ì - lowercase i, grave accent
    {"iacute", "237"}, // í - lowercase i, acute accent
    {"icirc", "238"}, // î - lowercase i, circumflex accent
    {"iuml", "239"}, // ï - lowercase i, umlaut
    {"eth", "240"}, // ð - lowercase eth, Icelandic
    {"ntilde", "241"}, // ñ - lowercase n, tilde
    {"ograve", "242"}, // ò - lowercase o, grave accent
    {"oacute", "243"}, // ó - lowercase o, acute accent
    {"ocirc", "244"}, // ô - lowercase o, circumflex accent
    {"otilde", "245"}, // õ - lowercase o, tilde
    {"ouml", "246"}, // ö - lowercase o, umlaut
    {"divide", "247"}, // division sign
    {"oslash", "248"}, // ø - lowercase o, slash
    {"ugrave", "249"}, // ù - lowercase u, grave accent
    {"uacute", "250"}, // ú - lowercase u, acute accent
    {"ucirc", "251"}, // û - lowercase u, circumflex accent
    {"uuml", "252"}, // ü - lowercase u, umlaut
    {"yacute", "253"}, // ý - lowercase y, acute accent
    {"thorn", "254"}, // þ - lowercase thorn, Icelandic
    {"yuml", "255"}, // ÿ - lowercase y, umlaut
};
| 180 | ||
| 181 | // http://www.w3.org/TR/REC-html40/sgml/entities.html | |
| 182 | // package scoped for testing | |
| 183 | static final String[][] HTML40_ARRAY = { | |
| 184 | // <!-- Latin Extended-B --> | |
| 185 | {"fnof", "402"}, // latin small f with hook = function= florin, U+0192 ISOtech --> | |
| 186 | // <!-- Greek --> | |
| 187 | {"Alpha", "913"}, // greek capital letter alpha, U+0391 --> | |
| 188 | {"Beta", "914"}, // greek capital letter beta, U+0392 --> | |
| 189 | {"Gamma", "915"}, // greek capital letter gamma,U+0393 ISOgrk3 --> | |
| 190 | {"Delta", "916"}, // greek capital letter delta,U+0394 ISOgrk3 --> | |
| 191 | {"Epsilon", "917"}, // greek capital letter epsilon, U+0395 --> | |
| 192 | {"Zeta", "918"}, // greek capital letter zeta, U+0396 --> | |
| 193 | {"Eta", "919"}, // greek capital letter eta, U+0397 --> | |
| 194 | {"Theta", "920"}, // greek capital letter theta,U+0398 ISOgrk3 --> | |
| 195 | {"Iota", "921"}, // greek capital letter iota, U+0399 --> | |
| 196 | {"Kappa", "922"}, // greek capital letter kappa, U+039A --> | |
| 197 | {"Lambda", "923"}, // greek capital letter lambda,U+039B ISOgrk3 --> | |
| 198 | {"Mu", "924"}, // greek capital letter mu, U+039C --> | |
| 199 | {"Nu", "925"}, // greek capital letter nu, U+039D --> | |
| 200 | {"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 --> | |
| 201 | {"Omicron", "927"}, // greek capital letter omicron, U+039F --> | |
| 202 | {"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 --> | |
| 203 | {"Rho", "929"}, // greek capital letter rho, U+03A1 --> | |
| 204 | // <!-- there is no Sigmaf, and no U+03A2 character either --> | |
| 205 | {"Sigma", "931"}, // greek capital letter sigma,U+03A3 ISOgrk3 --> | |
| 206 | {"Tau", "932"}, // greek capital letter tau, U+03A4 --> | |
| 207 | {"Upsilon", "933"}, // greek capital letter upsilon,U+03A5 ISOgrk3 --> | |
| 208 | {"Phi", "934"}, // greek capital letter phi,U+03A6 ISOgrk3 --> | |
| 209 | {"Chi", "935"}, // greek capital letter chi, U+03A7 --> | |
| 210 | {"Psi", "936"}, // greek capital letter psi,U+03A8 ISOgrk3 --> | |
| 211 | {"Omega", "937"}, // greek capital letter omega,U+03A9 ISOgrk3 --> | |
| 212 | {"alpha", "945"}, // greek small letter alpha,U+03B1 ISOgrk3 --> | |
| 213 | {"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 --> | |
| 214 | {"gamma", "947"}, // greek small letter gamma,U+03B3 ISOgrk3 --> | |
| 215 | {"delta", "948"}, // greek small letter delta,U+03B4 ISOgrk3 --> | |
| 216 | {"epsilon", "949"}, // greek small letter epsilon,U+03B5 ISOgrk3 --> | |
| 217 | {"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 --> | |
| 218 | {"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 --> | |
| 219 | {"theta", "952"}, // greek small letter theta,U+03B8 ISOgrk3 --> | |
| 220 | {"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 --> | |
| 221 | {"kappa", "954"}, // greek small letter kappa,U+03BA ISOgrk3 --> | |
| 222 | {"lambda", "955"}, // greek small letter lambda,U+03BB ISOgrk3 --> | |
| 223 | {"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 --> | |
| 224 | {"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 --> | |
| 225 | {"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 --> | |
| 226 | {"omicron", "959"}, // greek small letter omicron, U+03BF NEW --> | |
| 227 | {"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 --> | |
| 228 | {"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 --> | |
| 229 | {"sigmaf", "962"}, // greek small letter final sigma,U+03C2 ISOgrk3 --> | |
| 230 | {"sigma", "963"}, // greek small letter sigma,U+03C3 ISOgrk3 --> | |
| 231 | {"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 --> | |
| 232 | {"upsilon", "965"}, // greek small letter upsilon,U+03C5 ISOgrk3 --> | |
| 233 | {"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 --> | |
| 234 | {"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 --> | |
| 235 | {"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 --> | |
| 236 | {"omega", "969"}, // greek small letter omega,U+03C9 ISOgrk3 --> | |
| 237 | {"thetasym", "977"}, // greek small letter theta symbol,U+03D1 NEW --> | |
| 238 | {"upsih", "978"}, // greek upsilon with hook symbol,U+03D2 NEW --> | |
| 239 | {"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 --> | |
| 240 | // <!-- General Punctuation --> | |
| 241 | {"bull", "8226"}, // bullet = black small circle,U+2022 ISOpub --> | |
| 242 | // <!-- bullet is NOT the same as bullet operator, U+2219 --> | |
| 243 | {"hellip", "8230"}, // horizontal ellipsis = three dot leader,U+2026 ISOpub --> | |
| 244 | {"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech --> | |
| 245 | {"Prime", "8243"}, // double prime = seconds = inches,U+2033 ISOtech --> | |
| 246 | {"oline", "8254"}, // overline = spacing overscore,U+203E NEW --> | |
| 247 | {"frasl", "8260"}, // fraction slash, U+2044 NEW --> | |
| 248 | // <!-- Letterlike Symbols --> | |
| 249 | {"weierp", "8472"}, // script capital P = power set= Weierstrass p, U+2118 ISOamso --> | |
| 250 | {"image", "8465"}, // blackletter capital I = imaginary part,U+2111 ISOamso --> | |
| 251 | {"real", "8476"}, // blackletter capital R = real part symbol,U+211C ISOamso --> | |
| 252 | {"trade", "8482"}, // trade mark sign, U+2122 ISOnum --> | |
| 253 | {"alefsym", "8501"}, // alef symbol = first transfinite cardinal,U+2135 NEW --> | |
| 254 | // <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the | |
| 255 | // same glyph could be used to depict both characters --> | |
| 256 | // <!-- Arrows --> | |
| 257 | {"larr", "8592"}, // leftwards arrow, U+2190 ISOnum --> | |
| 258 | {"uarr", "8593"}, // upwards arrow, U+2191 ISOnum--> | |
| 259 | {"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum --> | |
| 260 | {"darr", "8595"}, // downwards arrow, U+2193 ISOnum --> | |
| 261 | {"harr", "8596"}, // left right arrow, U+2194 ISOamsa --> | |
| 262 | {"crarr", "8629"}, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW --> | |
| 263 | {"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech --> | |
| 264 | // <!-- ISO 10646 does not say that lArr is the same as the 'is implied by' | |
| 265 | // arrow but also does not have any other character for that function. | |
| 266 | // So ? lArr canbe used for 'is implied by' as ISOtech suggests --> | |
| 267 | {"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa --> | |
| 268 | {"rArr", "8658"}, // rightwards double arrow,U+21D2 ISOtech --> | |
| 269 | // <!-- ISO 10646 does not say this is the 'implies' character but does not | |
| 270 | // have another character with this function so ?rArr can be used for | |
| 271 | // 'implies' as ISOtech suggests --> | |
| 272 | {"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa --> | |
| 273 | {"hArr", "8660"}, // left right double arrow,U+21D4 ISOamsa --> | |
| 274 | // <!-- Mathematical Operators --> | |
| 275 | {"forall", "8704"}, // for all, U+2200 ISOtech --> | |
| 276 | {"part", "8706"}, // partial differential, U+2202 ISOtech --> | |
| 277 | {"exist", "8707"}, // there exists, U+2203 ISOtech --> | |
| 278 | {"empty", "8709"}, // empty set = null set = diameter,U+2205 ISOamso --> | |
| 279 | {"nabla", "8711"}, // nabla = backward difference,U+2207 ISOtech --> | |
| 280 | {"isin", "8712"}, // element of, U+2208 ISOtech --> | |
| 281 | {"notin", "8713"}, // not an element of, U+2209 ISOtech --> | |
| 282 | {"ni", "8715"}, // contains as member, U+220B ISOtech --> | |
| 283 | // <!-- should there be a more memorable name than 'ni'? --> | |
| 284 | {"prod", "8719"}, // n-ary product = product sign,U+220F ISOamsb --> | |
| 285 | // <!-- prod is NOT the same character as U+03A0 'greek capital letter pi' | |
| 286 | // though the same glyph might be used for both --> | |
| 287 | {"sum", "8721"}, // n-ary summation, U+2211 ISOamsb --> | |
| 288 | // <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma' | |
| 289 | // though the same glyph might be used for both --> | |
| 290 | {"minus", "8722"}, // minus sign, U+2212 ISOtech --> | |
| 291 | {"lowast", "8727"}, // asterisk operator, U+2217 ISOtech --> | |
| 292 | {"radic", "8730"}, // square root = radical sign,U+221A ISOtech --> | |
| 293 | {"prop", "8733"}, // proportional to, U+221D ISOtech --> | |
| 294 | {"infin", "8734"}, // infinity, U+221E ISOtech --> | |
| 295 | {"ang", "8736"}, // angle, U+2220 ISOamso --> | |
| 296 | {"and", "8743"}, // logical and = wedge, U+2227 ISOtech --> | |
| 297 | {"or", "8744"}, // logical or = vee, U+2228 ISOtech --> | |
| 298 | {"cap", "8745"}, // intersection = cap, U+2229 ISOtech --> | |
| 299 | {"cup", "8746"}, // union = cup, U+222A ISOtech --> | |
| 300 | {"int", "8747"}, // integral, U+222B ISOtech --> | |
| 301 | {"there4", "8756"}, // therefore, U+2234 ISOtech --> | |
| 302 | {"sim", "8764"}, // tilde operator = varies with = similar to,U+223C ISOtech --> | |
| 303 | // <!-- tilde operator is NOT the same character as the tilde, U+007E,although | |
| 304 | // the same glyph might be used to represent both --> | |
| 305 | {"cong", "8773"}, // approximately equal to, U+2245 ISOtech --> | |
| 306 | {"asymp", "8776"}, // almost equal to = asymptotic to,U+2248 ISOamsr --> | |
| 307 | {"ne", "8800"}, // not equal to, U+2260 ISOtech --> | |
| 308 | {"equiv", "8801"}, // identical to, U+2261 ISOtech --> | |
| 309 | {"le", "8804"}, // less-than or equal to, U+2264 ISOtech --> | |
| 310 | {"ge", "8805"}, // greater-than or equal to,U+2265 ISOtech --> | |
| 311 | {"sub", "8834"}, // subset of, U+2282 ISOtech --> | |
| 312 | {"sup", "8835"}, // superset of, U+2283 ISOtech --> | |
| 313 | // <!-- note that nsup, 'not a superset of, U+2283' is not covered by the | |
| 314 | // Symbol font encoding and is not included. Should it be, for symmetry? | |
| 315 | // It is in ISOamsn --> <!ENTITY nsub", "8836"}, | |
| 316 | // not a subset of, U+2284 ISOamsn --> | |
| 317 | {"sube", "8838"}, // subset of or equal to, U+2286 ISOtech --> | |
| 318 | {"supe", "8839"}, // superset of or equal to,U+2287 ISOtech --> | |
| 319 | {"oplus", "8853"}, // circled plus = direct sum,U+2295 ISOamsb --> | |
| 320 | {"otimes", "8855"}, // circled times = vector product,U+2297 ISOamsb --> | |
| 321 | {"perp", "8869"}, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech --> | |
| 322 | {"sdot", "8901"}, // dot operator, U+22C5 ISOamsb --> | |
| 323 | // <!-- dot operator is NOT the same character as U+00B7 middle dot --> | |
| 324 | // <!-- Miscellaneous Technical --> | |
| 325 | {"lceil", "8968"}, // left ceiling = apl upstile,U+2308 ISOamsc --> | |
| 326 | {"rceil", "8969"}, // right ceiling, U+2309 ISOamsc --> | |
| 327 | {"lfloor", "8970"}, // left floor = apl downstile,U+230A ISOamsc --> | |
| 328 | {"rfloor", "8971"}, // right floor, U+230B ISOamsc --> | |
| 329 | {"lang", "9001"}, // left-pointing angle bracket = bra,U+2329 ISOtech --> | |
| 330 | // <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation | |
| 331 | // mark' --> | |
| 332 | {"rang", "9002"}, // right-pointing angle bracket = ket,U+232A ISOtech --> | |
| 333 | // <!-- rang is NOT the same character as U+003E 'greater than' or U+203A | |
| 334 | // 'single right-pointing angle quotation mark' --> | |
| 335 | // <!-- Geometric Shapes --> | |
| 336 | {"loz", "9674"}, // lozenge, U+25CA ISOpub --> | |
| 337 | // <!-- Miscellaneous Symbols --> | |
| 338 | {"spades", "9824"}, // black spade suit, U+2660 ISOpub --> | |
| 339 | // <!-- black here seems to mean filled as opposed to hollow --> | |
| 340 | {"clubs", "9827"}, // black club suit = shamrock,U+2663 ISOpub --> | |
| 341 | {"hearts", "9829"}, // black heart suit = valentine,U+2665 ISOpub --> | |
| 342 | {"diams", "9830"}, // black diamond suit, U+2666 ISOpub --> | |
| 343 | ||
| 344 | // <!-- Latin Extended-A --> | |
| 345 | {"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 --> | |
| 346 | {"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 --> | |
| 347 | // <!-- ligature is a misnomer, this is a separate character in some languages --> | |
| 348 | {"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 --> | |
| 349 | {"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 --> | |
| 350 | {"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 --> | |
| 351 | // <!-- Spacing Modifier Letters --> | |
| 352 | {"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub --> | |
| 353 | {"tilde", "732"}, // small tilde, U+02DC ISOdia --> | |
| 354 | // <!-- General Punctuation --> | |
| 355 | {"ensp", "8194"}, // en space, U+2002 ISOpub --> | |
| 356 | {"emsp", "8195"}, // em space, U+2003 ISOpub --> | |
| 357 | {"thinsp", "8201"}, // thin space, U+2009 ISOpub --> | |
| 358 | {"zwnj", "8204"}, // zero width non-joiner,U+200C NEW RFC 2070 --> | |
| 359 | {"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 --> | |
| 360 | {"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 --> | |
| 361 | {"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 --> | |
| 362 | {"ndash", "8211"}, // en dash, U+2013 ISOpub --> | |
| 363 | {"mdash", "8212"}, // em dash, U+2014 ISOpub --> | |
| 364 | {"lsquo", "8216"}, // left single quotation mark,U+2018 ISOnum --> | |
| 365 | {"rsquo", "8217"}, // right single quotation mark,U+2019 ISOnum --> | |
| 366 | {"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW --> | |
| 367 | {"ldquo", "8220"}, // left double quotation mark,U+201C ISOnum --> | |
| 368 | {"rdquo", "8221"}, // right double quotation mark,U+201D ISOnum --> | |
| 369 | {"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW --> | |
| 370 | {"dagger", "8224"}, // dagger, U+2020 ISOpub --> | |
| 371 | {"Dagger", "8225"}, // double dagger, U+2021 ISOpub --> | |
| 372 | {"permil", "8240"}, // per mille sign, U+2030 ISOtech --> | |
| 373 | {"lsaquo", "8249"}, // single left-pointing angle quotation mark,U+2039 ISO proposed --> | |
| 374 | // <!-- lsaquo is proposed but not yet ISO standardized --> | |
| 375 | {"rsaquo", "8250"}, // single right-pointing angle quotation mark,U+203A ISO proposed --> | |
| 376 | // <!-- rsaquo is proposed but not yet ISO standardized --> | |
| 377 | {"euro", "8364"}, // -- euro sign, U+20AC NEW --> | |
| 378 | }; | |
| 379 | ||
/**
 * <p>
 * Entity set for standard XML: the basic markup entities plus the apostrophe.
 * </p>
 */
public static final Entities XML;

/**
 * <p>
 * Entity set for HTML 3.2: the basic markup entities plus the ISO 8859-1 table.
 * </p>
 */
public static final Entities HTML32;

/**
 * <p>
 * Entity set for HTML 4.0, populated by {@link #fillWithHtml40Entities(Entities)}.
 * </p>
 */
public static final Entities HTML40;

// One initializer builds the three shared sets in declaration order: XML, HTML32, HTML40.
static {
    XML = new Entities();
    XML.addEntities(BASIC_ARRAY);
    XML.addEntities(APOS_ARRAY);

    HTML32 = new Entities();
    HTML32.addEntities(BASIC_ARRAY);
    HTML32.addEntities(ISO8859_1_ARRAY);

    HTML40 = new Entities();
    fillWithHtml40Entities(HTML40);
}
| 417 | ||
| 418 | /** | |
| 419 | * <p> | |
| 420 | * Fills the specified entities instance with HTML 40 entities. | |
| 421 | * </p> | |
| 422 | * | |
| 423 | * @param entities | |
| 424 | * the instance to be filled. | |
| 425 | */ | |
| 426 | static void fillWithHtml40Entities(Entities entities) { | |
| 427 | entities.addEntities(BASIC_ARRAY); | |
| 428 | entities.addEntities(ISO8859_1_ARRAY); | |
| 429 | entities.addEntities(HTML40_ARRAY); | |
| 430 | } | |
| 431 | ||
/**
 * Two-way mapping between entity names and their integer character codes. Several implementations with
 * different storage strategies follow in this file.
 */
static interface EntityMap {
    /**
     * <p>
     * Add an entry to this entity map.
     * </p>
     *
     * @param name
     *            the entity name
     * @param value
     *            the entity value (the character code the name stands for)
     */
    void add(String name, int value);

    /**
     * <p>
     * Returns the name of the entity identified by the specified value.
     * </p>
     *
     * @param value
     *            the value to locate
     * @return entity name associated with the specified value; the array- and map-backed implementations in
     *         this file return <code>null</code> when there is no such entry
     */
    String name(int value);

    /**
     * <p>
     * Returns the value of the entity identified by the specified name.
     * </p>
     *
     * @param name
     *            the name to locate
     * @return entity value associated with the specified name; the implementations in this file return
     *         <code>-1</code> when the name is unknown
     */
    int value(String name);
}
| 467 | ||
| 468 | static class PrimitiveEntityMap implements EntityMap { | |
| 469 | private Map mapNameToValue = new HashMap(); | |
| 470 | ||
| 471 | private IntHashMap mapValueToName = new IntHashMap(); | |
| 472 | ||
| 473 | /** | |
| 474 | * {@inheritDoc} | |
| 475 | */ | |
| 476 | public void add(String name, int value) { | |
| 477 | mapNameToValue.put(name, new Integer(value)); | |
| 478 | mapValueToName.put(value, name); | |
| 479 | } | |
| 480 | ||
| 481 | /** | |
| 482 | * {@inheritDoc} | |
| 483 | */ | |
| 484 | public String name(int value) { | |
| 485 | return (String) mapValueToName.get(value); | |
| 486 | } | |
| 487 | ||
| 488 | /** | |
| 489 | * {@inheritDoc} | |
| 490 | */ | |
| 491 | public int value(String name) { | |
| 492 | Object value = mapNameToValue.get(name); | |
| 493 | if (value == null) { | |
| 494 | return -1; | |
| 495 | } | |
| 496 | return ((Integer) value).intValue(); | |
| 497 | } | |
| 498 | } | |
| 499 | ||
| 500 | static abstract class MapIntMap implements Entities.EntityMap { | |
| 501 | protected Map mapNameToValue; | |
| 502 | ||
| 503 | protected Map mapValueToName; | |
| 504 | ||
| 505 | /** | |
| 506 | * {@inheritDoc} | |
| 507 | */ | |
| 508 | public void add(String name, int value) { | |
| 509 | mapNameToValue.put(name, new Integer(value)); | |
| 510 | mapValueToName.put(new Integer(value), name); | |
| 511 | } | |
| 512 | ||
| 513 | /** | |
| 514 | * {@inheritDoc} | |
| 515 | */ | |
| 516 | public String name(int value) { | |
| 517 | return (String) mapValueToName.get(new Integer(value)); | |
| 518 | } | |
| 519 | ||
| 520 | /** | |
| 521 | * {@inheritDoc} | |
| 522 | */ | |
| 523 | public int value(String name) { | |
| 524 | Object value = mapNameToValue.get(name); | |
| 525 | if (value == null) { | |
| 526 | return -1; | |
| 527 | } | |
| 528 | return ((Integer) value).intValue(); | |
| 529 | } | |
| 530 | } | |
| 531 | ||
| 532 | static class HashEntityMap extends MapIntMap { | |
| 533 | /** | |
| 534 | * Constructs a new instance of <code>HashEntityMap</code>. | |
| 535 | */ | |
| 536 | public HashEntityMap() { | |
| 537 | mapNameToValue = new HashMap(); | |
| 538 | mapValueToName = new HashMap(); | |
| 539 | } | |
| 540 | } | |
| 541 | ||
| 542 | static class TreeEntityMap extends MapIntMap { | |
| 543 | /** | |
| 544 | * Constructs a new instance of <code>TreeEntityMap</code>. | |
| 545 | */ | |
| 546 | public TreeEntityMap() { | |
| 547 | mapNameToValue = new TreeMap(); | |
| 548 | mapValueToName = new TreeMap(); | |
| 549 | } | |
| 550 | } | |
| 551 | ||
| 552 | static class LookupEntityMap extends PrimitiveEntityMap { | |
| 553 | private String[] lookupTable; | |
| 554 | ||
| 555 | private int LOOKUP_TABLE_SIZE = 256; | |
| 556 | ||
| 557 | /** | |
| 558 | * {@inheritDoc} | |
| 559 | */ | |
| 560 | public String name(int value) { | |
| 561 | if (value < LOOKUP_TABLE_SIZE) { | |
| 562 | return lookupTable()[value]; | |
| 563 | } | |
| 564 | return super.name(value); | |
| 565 | } | |
| 566 | ||
| 567 | /** | |
| 568 | * <p> | |
| 569 | * Returns the lookup table for this entity map. The lookup table is created if it has not been previously. | |
| 570 | * </p> | |
| 571 | * | |
| 572 | * @return the lookup table | |
| 573 | */ | |
| 574 | private String[] lookupTable() { | |
| 575 | if (lookupTable == null) { | |
| 576 | createLookupTable(); | |
| 577 | } | |
| 578 | return lookupTable; | |
| 579 | } | |
| 580 | ||
| 581 | /** | |
| 582 | * <p> | |
| 583 | * Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names. | |
| 584 | * </p> | |
| 585 | */ | |
| 586 | private void createLookupTable() { | |
| 587 | lookupTable = new String[LOOKUP_TABLE_SIZE]; | |
| 588 | for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) { | |
| 589 | lookupTable[i] = super.name(i); | |
| 590 | } | |
| 591 | } | |
| 592 | } | |
| 593 | ||
| 594 | static class ArrayEntityMap implements EntityMap { | |
| 595 | protected int growBy = 100; | |
| 596 | ||
| 597 | protected int size = 0; | |
| 598 | ||
| 599 | protected String[] names; | |
| 600 | ||
| 601 | protected int[] values; | |
| 602 | ||
| 603 | /** | |
| 604 | * Constructs a new instance of <code>ArrayEntityMap</code>. | |
| 605 | */ | |
| 606 | public ArrayEntityMap() { | |
| 607 | names = new String[growBy]; | |
| 608 | values = new int[growBy]; | |
| 609 | } | |
| 610 | ||
| 611 | /** | |
| 612 | * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the array should | |
| 613 | * grow. | |
| 614 | * | |
| 615 | * @param growBy | |
| 616 | * array will be initialized to and will grow by this amount | |
| 617 | */ | |
| 618 | public ArrayEntityMap(int growBy) { | |
| 619 | this.growBy = growBy; | |
| 620 | names = new String[growBy]; | |
| 621 | values = new int[growBy]; | |
| 622 | } | |
| 623 | ||
| 624 | /** | |
| 625 | * {@inheritDoc} | |
| 626 | */ | |
| 627 | public void add(String name, int value) { | |
| 628 | ensureCapacity(size + 1); | |
| 629 | names[size] = name; | |
| 630 | values[size] = value; | |
| 631 | size++; | |
| 632 | } | |
| 633 | ||
| 634 | /** | |
| 635 | * Verifies the capacity of the entity array, adjusting the size if necessary. | |
| 636 | * | |
| 637 | * @param capacity | |
| 638 | * size the array should be | |
| 639 | */ | |
| 640 | protected void ensureCapacity(int capacity) { | |
| 641 | if (capacity > names.length) { | |
| 642 | int newSize = Math.max(capacity, size + growBy); | |
| 643 | String[] newNames = new String[newSize]; | |
| 644 | System.arraycopy(names, 0, newNames, 0, size); | |
| 645 | names = newNames; | |
| 646 | int[] newValues = new int[newSize]; | |
| 647 | System.arraycopy(values, 0, newValues, 0, size); | |
| 648 | values = newValues; | |
| 649 | } | |
| 650 | } | |
| 651 | ||
| 652 | /** | |
| 653 | * {@inheritDoc} | |
| 654 | */ | |
| 655 | public String name(int value) { | |
| 656 | for (int i = 0; i < size; ++i) { | |
| 657 | if (values[i] == value) { | |
| 658 | return names[i]; | |
| 659 | } | |
| 660 | } | |
| 661 | return null; | |
| 662 | } | |
| 663 | ||
| 664 | /** | |
| 665 | * {@inheritDoc} | |
| 666 | */ | |
| 667 | public int value(String name) { | |
| 668 | for (int i = 0; i < size; ++i) { | |
| 669 | if (names[i].equals(name)) { | |
| 670 | return values[i]; | |
| 671 | } | |
| 672 | } | |
| 673 | return -1; | |
| 674 | } | |
| 675 | } | |
| 676 | ||
| 677 | static class BinaryEntityMap extends ArrayEntityMap { | |
| 678 | ||
| 679 | /** | |
| 680 | * Constructs a new instance of <code>BinaryEntityMap</code>. | |
| 681 | */ | |
| 682 | public BinaryEntityMap() { | |
| 683 | super(); | |
| 684 | } | |
| 685 | ||
| 686 | /** | |
| 687 | * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the underlying array | |
| 688 | * should grow. | |
| 689 | * | |
| 690 | * @param growBy | |
| 691 | * array will be initialized to and will grow by this amount | |
| 692 | */ | |
| 693 | public BinaryEntityMap(int growBy) { | |
| 694 | super(growBy); | |
| 695 | } | |
| 696 | ||
| 697 | /** | |
| 698 | * Performs a binary search of the entity array for the specified key. This method is based on code in | |
| 699 | * {@link java.util.Arrays}. | |
| 700 | * | |
| 701 | * @param key | |
| 702 | * the key to be found | |
| 703 | * @return the index of the entity array matching the specified key | |
| 704 | */ | |
| 705 | private int binarySearch(int key) { | |
| 706 | int low = 0; | |
| 707 | int high = size - 1; | |
| 708 | ||
| 709 | while (low <= high) { | |
| 710 | int mid = (low + high) >> 1; | |
| 711 | int midVal = values[mid]; | |
| 712 | ||
| 713 | if (midVal < key) { | |
| 714 | low = mid + 1; | |
| 715 | } else if (midVal > key) { | |
| 716 | high = mid - 1; | |
| 717 | } else { | |
| 718 | return mid; // key found | |
| 719 | } | |
| 720 | } | |
| 721 | return -(low + 1); // key not found. | |
| 722 | } | |
| 723 | ||
| 724 | /** | |
| 725 | * {@inheritDoc} | |
| 726 | */ | |
| 727 | public void add(String name, int value) { | |
| 728 | ensureCapacity(size + 1); | |
| 729 | int insertAt = binarySearch(value); | |
| 730 | if (insertAt > 0) { | |
| 731 | return; // note: this means you can't insert the same value twice | |
| 732 | } | |
| 733 | insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one | |
| 734 | System.arraycopy(values, insertAt, values, insertAt + 1, size - insertAt); | |
| 735 | values[insertAt] = value; | |
| 736 | System.arraycopy(names, insertAt, names, insertAt + 1, size - insertAt); | |
| 737 | names[insertAt] = name; | |
| 738 | size++; | |
| 739 | } | |
| 740 | ||
| 741 | /** | |
| 742 | * {@inheritDoc} | |
| 743 | */ | |
| 744 | public String name(int value) { | |
| 745 | int index = binarySearch(value); | |
| 746 | if (index < 0) { | |
| 747 | return null; | |
| 748 | } | |
| 749 | return names[index]; | |
| 750 | } | |
| 751 | } | |
| 752 | ||
| 753 | // package scoped for testing; this is the EntityMap that addEntity() and entityName() delegate to | |
| 754 | EntityMap map = new Entities.LookupEntityMap(); | |
| 755 | ||
| 756 | /** | |
| 757 | * <p> | |
| 758 | * Adds entities to this entity. | |
| 759 | * </p> | |
| 760 | * | |
| 761 | * @param entityArray | |
| 762 | * array of entities to be added | |
| 763 | */ | |
| 764 | public void addEntities(String[][] entityArray) { | |
| 765 | for (int i = 0; i < entityArray.length; ++i) { | |
| 766 | addEntity(entityArray[i][0], Integer.parseInt(entityArray[i][1])); | |
| 767 | } | |
| 768 | } | |
| 769 | ||
| 770 | /** | |
| 771 | * <p> | |
| 772 | * Add an entity to this entity. | |
| 773 | * </p> | |
| 774 | * | |
| 775 | * @param name | |
| 776 | * name of the entity | |
| 777 | * @param value | |
| 778 | value of the entity | |
| 779 | */ | |
| 780 | public void addEntity(String name, int value) { | |
| 781 | map.add(name, value); | |
| 782 | } | |
| 783 | ||
| 784 | /** | |
| 785 | * <p> | |
| 786 | * Returns the name of the entity identified by the specified value. | |
| 787 | * </p> | |
| 788 | * | |
| 789 | * @param value | |
| 790 | * the value to locate | |
| 791 | * @return entity name associated with the specified value | |
| 792 | */ | |
| 793 | public String entityName(int value) { | |
| 794 | return map.name(value); | |
| 795 | } | |
| 796 | ||
| 797 | /** | |
| 798 | * <p> | |
| 799 | * Returns the value of the entity identified by the specified name. | |
| 800 | * </p> | |