You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

239 lines
8.5KB

  1. <?php
  /**
   * Tests for the static helpers on HTMLPurifier_Encoder: cleanUTF8(),
   * convertToUTF8(), convertFromUTF8(), convertToASCIIDumbLossless(),
   * testEncodingSupportsASCII(), testIconvTruncateBug() and iconv().
   *
   * Tests that depend on iconv return early (without asserting anything)
   * when HTMLPurifier_Encoder::iconvAvailable() reports false.
   */
  class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
  {
      /**
       * Value of HTMLPurifier_EntityLookup::instance(), assigned in setUp().
       */
      protected $_entity_lookup;

      /**
       * Stores the entity lookup instance, then delegates to the parent setUp().
       */
      public function setUp()
      {
          $lookup = HTMLPurifier_EntityLookup::instance();
          $this->_entity_lookup = $lookup;
          parent::setUp();
      }

      /**
       * Asserts that cleanUTF8() yields the expected string both when called
       * with one argument (failure label "iconv") and when called with the
       * second argument set to true (failure label "PHP").
       *
       * @param string      $string Input passed to cleanUTF8().
       * @param string|null $expect Expected output; null means "same as $string".
       */
      public function assertCleanUTF8($string, $expect = null)
      {
          $expected = ($expect === null) ? $string : $expect;

          $firstResult = HTMLPurifier_Encoder::cleanUTF8($string);
          $this->assertIdentical($firstResult, $expected, 'iconv: %s');

          $secondResult = HTMLPurifier_Encoder::cleanUTF8($string, true);
          $this->assertIdentical($secondResult, $expected, 'PHP: %s');
      }

      /**
       * Runs cleanUTF8() over a table of inputs; a null expectation means the
       * input must come back unchanged.
       */
      public function test_cleanUTF8()
      {
          $cases = array(
              array('Normal string.', null),
              array("Test\tAllowed\nControl\rCharacters", null),
              array("null byte: \0", 'null byte: '),
              array("\1\2\3\4\5\6\7", ''),
              array("\x7F", ''),              // one byte invalid SGML char
              array("\xC2\x80", ''),          // two byte invalid SGML
              array("\xF3\xBF\xBF\xBF", null), // valid four byte
              array("\xDF\xFF", ''),          // malformed UTF8
              array("\xED\xB0\x80", ''),      // invalid codepoints
          );

          foreach ($cases as $case) {
              $this->assertCleanUTF8($case[0], $case[1]);
          }
      }

      /**
       * With the harness's default configuration (no Core.Encoding override
       * here), convertToUTF8() must hand the input back untouched.
       */
      public function test_convertToUTF8_noConvert()
      {
          // UTF-8 means that we don't touch it, even though this byte is invalid
          $untouched = "\xF6";
          $actual = HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context);
          $this->assertIdentical($actual, $untouched, 'Expected identical [Binary: F6]');
      }

      /**
       * An unknown encoding name must raise the 'Invalid encoding utf99' error
       * and produce an empty string.
       */
      public function test_convertToUTF8_spuriousEncoding()
      {
          if (!HTMLPurifier_Encoder::iconvAvailable()) {
              return;
          }

          $this->config->set('Core.Encoding', 'utf99');
          // The expectation has to be registered before the call that triggers it.
          $this->expectError('Invalid encoding utf99');

          $actual = HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context);
          $this->assertIdentical($actual, '');
      }

      /**
       * ISO-8859-1 byte F6 must become the UTF-8 sequence C3 B6.
       */
      public function test_convertToUTF8_iso8859_1()
      {
          $this->config->set('Core.Encoding', 'ISO-8859-1');

          $actual = HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context);
          $this->assertIdentical($actual, "\xC3\xB6");
      }

      /**
       * Same conversion as the ISO-8859-1 test, with Test.ForceNoIconv enabled.
       */
      public function test_convertToUTF8_withoutIconv()
      {
          $this->config->set('Core.Encoding', 'ISO-8859-1');
          $this->config->set('Test.ForceNoIconv', true);

          $actual = HTMLPurifier_Encoder::convertToUTF8("\xF6", $this->config, $this->context);
          $this->assertIdentical($actual, "\xC3\xB6");
      }

      /**
       * Fixture string: two three-byte UTF-8 sequences followed by " (Chinese)".
       *
       * @return string
       */
      public function getZhongWen()
      {
          $fixture = "\xE4\xB8\xAD\xE6\x96\x87 (Chinese)";

          return $fixture;
      }

      /**
       * With the harness's default configuration, convertFromUTF8() must
       * return the input unchanged.
       */
      public function test_convertFromUTF8_utf8()
      {
          // UTF-8 means that we don't touch it
          $actual = HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context);
          $this->assertIdentical($actual, "\xC3\xB6");
      }

      /**
       * UTF-8 sequence C3 B6 must become the single ISO-8859-1 byte F6.
       */
      public function test_convertFromUTF8_iso8859_1()
      {
          $this->config->set('Core.Encoding', 'ISO-8859-1');

          $actual = HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context);
          $this->assertIdentical($actual, "\xF6", 'Expected identical [Binary: F6]');
      }

      /**
       * When iconv is available, the multi-byte characters of the fixture
       * are expected to be dropped, leaving only " (Chinese)".
       */
      public function test_convertFromUTF8_iconvNoChars()
      {
          if (!HTMLPurifier_Encoder::iconvAvailable()) {
              return;
          }

          $this->config->set('Core.Encoding', 'ISO-8859-1');

          $actual = HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context);
          $this->assertIdentical($actual, " (Chinese)");
      }

      /**
       * UTF-8 to ISO-8859-1 conversion of a representable character with
       * Test.ForceNoIconv enabled.
       */
      public function test_convertFromUTF8_phpNormal()
      {
          // Plain PHP implementation has slightly different behavior
          $this->config->set('Core.Encoding', 'ISO-8859-1');
          $this->config->set('Test.ForceNoIconv', true);

          $actual = HTMLPurifier_Encoder::convertFromUTF8("\xC3\xB6", $this->config, $this->context);
          $this->assertIdentical($actual, "\xF6", 'Expected identical [Binary: F6]');
      }

      /**
       * With Test.ForceNoIconv enabled, each multi-byte character of the
       * fixture is expected to be replaced by a question mark.
       */
      public function test_convertFromUTF8_phpNoChars()
      {
          $this->config->set('Core.Encoding', 'ISO-8859-1');
          $this->config->set('Test.ForceNoIconv', true);

          $actual = HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context);
          $this->assertIdentical($actual, "?? (Chinese)");
      }

      /**
       * With Core.EscapeNonASCIICharacters on and an ISO-8859-1 target, the
       * fixture's multi-byte characters must come out as numeric entities.
       */
      public function test_convertFromUTF8_withProtection()
      {
          // Preserve the characters!
          $this->config->set('Core.Encoding', 'ISO-8859-1');
          $this->config->set('Core.EscapeNonASCIICharacters', true);

          $actual = HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context);
          $this->assertIdentical($actual, "&#20013;&#25991; (Chinese)");
      }

      /**
       * Same expectation as the protection test, but without overriding
       * Core.Encoding.
       */
      public function test_convertFromUTF8_withProtectionButUtf8()
      {
          // Preserve the characters!
          $this->config->set('Core.EscapeNonASCIICharacters', true);

          $actual = HTMLPurifier_Encoder::convertFromUTF8($this->getZhongWen(), $this->config, $this->context);
          $this->assertIdentical($actual, "&#20013;&#25991; (Chinese)");
      }

      /**
       * convertToASCIIDumbLossless() must turn multi-byte sequences into
       * decimal numeric entities and leave plain ASCII alone.
       */
      public function test_convertToASCIIDumbLossless()
      {
          $cases = array(
              "\xC3\x9Eorn"      => "&#222;orn",   // Uppercase thorn letter
              "an"               => "an",
              "\xF3\xA0\x80\xA0" => "&#917536;",   // test up to four bytes
          );

          foreach ($cases as $input => $expected) {
              $actual = HTMLPurifier_Encoder::convertToASCIIDumbLossless($input);
              $this->assertIdentical($actual, $expected);
          }
      }

      /**
       * Asserts that testEncodingSupportsASCII() returns $ret for $enc, both
       * with and without its second argument set to true.
       *
       * Nothing is asserted when the initial call (second argument true)
       * returns false.
       *
       * @param string $enc Encoding name to probe.
       * @param array  $ret Expected return value.
       */
      public function assertASCIISupportCheck($enc, $ret)
      {
          $probe = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
          if (false === $probe) {
              return;
          }

          $withoutFlag = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc);
          $this->assertIdentical($withoutFlag, $ret);

          $withFlag = HTMLPurifier_Encoder::testEncodingSupportsASCII($enc, true);
          $this->assertIdentical($withFlag, $ret);
      }

      /**
       * Checks the ASCII-support probe for several encodings; the Shift_JIS
       * and JOHAB cases run only when iconv is available.
       */
      public function test_testEncodingSupportsASCII()
      {
          $haveIconv = HTMLPurifier_Encoder::iconvAvailable();

          if ($haveIconv) {
              $shiftJisExpected = array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
              $this->assertASCIISupportCheck('Shift_JIS', $shiftJisExpected);

              $johabExpected = array("\xE2\x82\xA9" => '\\');
              $this->assertASCIISupportCheck('JOHAB', $johabExpected);
          }

          $this->assertASCIISupportCheck('ISO-8859-1', array());
          $this->assertASCIISupportCheck('dontexist', array()); // canary
      }

      /**
       * Backslash and tilde must survive a Shift_JIS conversion in both
       * directions.
       */
      public function testShiftJIS()
      {
          if (!HTMLPurifier_Encoder::iconvAvailable()) {
              return;
          }

          $this->config->set('Core.Encoding', 'Shift_JIS');

          // This actually looks like a Yen, but we're going to treat it differently
          $outbound = HTMLPurifier_Encoder::convertFromUTF8('\\~', $this->config, $this->context);
          $this->assertIdentical($outbound, '\\~');

          $inbound = HTMLPurifier_Encoder::convertToUTF8('\\~', $this->config, $this->context);
          $this->assertIdentical($inbound, '\\~');
      }

      /**
       * Only runs when testIconvTruncateBug() reports ICONV_TRUNCATES: a
       * leading unconvertible character must not cost any of the 10000
       * characters that follow it.
       */
      public function testIconvTruncateBug()
      {
          if (!HTMLPurifier_Encoder::iconvAvailable()) {
              return;
          }
          if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
              return;
          }

          $this->config->set('Core.Encoding', 'ISO-8859-1');

          $input = "\xE4\xB8\xAD" . str_repeat('a', 10000);
          $actual = HTMLPurifier_Encoder::convertFromUTF8($input, $this->config, $this->context);
          $this->assertIdentical($actual, str_repeat('a', 10000));
      }

      /**
       * Only runs when testIconvTruncateBug() reports ICONV_TRUNCATES: calls
       * HTMLPurifier_Encoder::iconv() with a fourth argument of 4 and
       * multi-byte characters placed at different offsets.
       */
      public function testIconvChunking()
      {
          if (!HTMLPurifier_Encoder::iconvAvailable()) {
              return;
          }
          if (HTMLPurifier_Encoder::testIconvTruncateBug() !== HTMLPurifier_Encoder::ICONV_TRUNCATES) {
              return;
          }

          $cases = array(
              "a\xF3\xA0\x80\xA0b"    => 'ab',
              "aa\xE4\xB8\xADb"       => 'aab',
              "aaa\xCE\xB1b"          => 'aaab',
              "aaaa\xF3\xA0\x80\xA0b" => 'aaaab',
              "aaaa\xE4\xB8\xADb"     => 'aaaab',
              "aaaa\xCE\xB1b"         => 'aaaab',
          );

          foreach ($cases as $input => $expected) {
              $actual = HTMLPurifier_Encoder::iconv('utf-8', 'iso-8859-1//IGNORE', $input, 4);
              $this->assertIdentical($actual, $expected);
          }
      }
  }
  209. // vim: et sw=4 sts=4