You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

LexerTest.php 26KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819
  1. <?php
  2. class HTMLPurifier_LexerTest extends HTMLPurifier_Harness
  3. {
  4. protected $_has_pear = false;
  /**
   * Builds the test case and, when the test configuration enables it,
   * loads the optional PH5P lexer implementation.
   */
  public function __construct()
  {
      parent::__construct();
      // !empty() tolerates a missing 'HTMLPurifierTest' or 'PH5P' key, so the
      // case can be constructed without an undefined-index/key notice when
      // the test bootstrap has not populated the global configuration.
      if (!empty($GLOBALS['HTMLPurifierTest']['PH5P'])) {
          require_once 'HTMLPurifier/Lexer/PH5P.php';
      }
  }
  12. // HTMLPurifier_Lexer::create() --------------------------------------------
  /**
   * With Core.MaintainLineNumbers switched on, create() is expected to
   * return a DirectLex instance.
   */
  public function test_create()
  {
      $config = $this->config;
      $config->set('Core.MaintainLineNumbers', true);
      $created = HTMLPurifier_Lexer::create($config);
      $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
  }
  /**
   * Core.LexerImpl may be given a lexer object; create() should then yield
   * a DirectLex instance.
   */
  public function test_create_objectLexerImpl()
  {
      $implementation = new HTMLPurifier_Lexer_DirectLex();
      $this->config->set('Core.LexerImpl', $implementation);
      $created = HTMLPurifier_Lexer::create($this->config);
      $this->assertIsA($created, 'HTMLPurifier_Lexer_DirectLex');
  }
  /**
   * An unrecognised Core.LexerImpl name must make create() throw.
   */
  public function test_create_unknownLexer()
  {
      $message = 'Cannot instantiate unrecognized Lexer type AsdfAsdf';
      $this->config->set('Core.LexerImpl', 'AsdfAsdf');
      // The expectation has to be registered before the throwing call.
      $this->expectException(new HTMLPurifier_Exception($message));
      HTMLPurifier_Lexer::create($this->config);
  }
  /**
   * Requesting DOMLex together with Core.MaintainLineNumbers must make
   * create() throw.
   */
  public function test_create_incompatibleLexer()
  {
      $message = 'Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)';
      $this->config->set('Core.LexerImpl', 'DOMLex');
      $this->config->set('Core.MaintainLineNumbers', true);
      // The expectation has to be registered before the throwing call.
      $this->expectException(new HTMLPurifier_Exception($message));
      HTMLPurifier_Lexer::create($this->config);
  }
  38. // HTMLPurifier_Lexer->parseData() -----------------------------------------
  /**
   * Asserts that HTMLPurifier_Lexer::parseData() turns $input into $expect.
   *
   * @param string      $input  Raw character data handed to parseData().
   * @param string|bool $expect Expected result; the default true means
   *                            "identical to $input".
   */
  public function assertParseData($input, $expect = true)
  {
      $expected = ($expect === true) ? $input : $expect;
      $lexer = new HTMLPurifier_Lexer();
      $actual = $lexer->parseData($input);
      $this->assertIdentical($expected, $actual);
  }
  /**
   * Plain text is expected to come out of parseData() unchanged.
   */
  public function test_parseData_plainText()
  {
      $plain = 'asdf';
      $this->assertParseData($plain);
  }
  /**
   * The &amp; entity is expected to decode to a bare ampersand.
   */
  public function test_parseData_ampersandEntity()
  {
      $encoded = '&amp;';
      $decoded = '&';
      $this->assertParseData($encoded, $decoded);
  }
  /**
   * The &quot; entity is expected to decode to a double quote.
   */
  public function test_parseData_quotEntity()
  {
      $encoded = '&quot;';
      $decoded = '"';
      $this->assertParseData($encoded, $decoded);
  }
  /**
   * The zero-padded numeric entity &#039; is expected to decode to an
   * apostrophe.
   */
  public function test_parseData_aposNumericEntity()
  {
      $encoded = '&#039;';
      $decoded = "'";
      $this->assertParseData($encoded, $decoded);
  }
  /**
   * The unpadded numeric entity &#39; is expected to decode to an apostrophe.
   */
  public function test_parseData_aposCompactNumericEntity()
  {
      $encoded = '&#39;';
      $decoded = "'";
      $this->assertParseData($encoded, $decoded);
  }
  /**
   * Three back-to-back &amp; entities are expected to decode to '&&&'.
   */
  public function test_parseData_adjacentAmpersandEntities()
  {
      $encoded = '&amp;&amp;&amp;';
      $decoded = '&&&';
      $this->assertParseData($encoded, $decoded);
  }
  /**
   * An entity followed by a raw trailing ampersand is expected to give '&&'.
   */
  public function test_parseData_trailingUnescapedAmpersand()
  {
      $encoded = '&amp;&';
      $decoded = '&&';
      $this->assertParseData($encoded, $decoded);
  }
  /**
   * A raw ampersand in the middle of text is expected to pass through
   * unchanged.
   */
  public function test_parseData_internalUnescapedAmpersand()
  {
      $text = 'Procter & Gamble';
      $this->assertParseData($text);
  }
  /**
   * The string '&#x2D;' is expected to come out of parseData() unchanged.
   */
  public function test_parseData_improperEntityFaultToleranceTest()
  {
      $text = '&#x2D;';
      $this->assertParseData($text);
  }
  81. // HTMLPurifier_Lexer->extractBody() ---------------------------------------
  /**
   * Asserts that HTMLPurifier_Lexer::extractBody() reduces $text to $extract.
   *
   * @param string      $text    Markup handed to extractBody().
   * @param string|bool $extract Expected result; the default true means
   *                             "identical to $text".
   */
  public function assertExtractBody($text, $extract = true)
  {
      $lexer = new HTMLPurifier_Lexer();
      $actual = $lexer->extractBody($text);
      $expected = ($extract === true) ? $text : $extract;
      $this->assertIdentical($expected, $actual);
  }
  /**
   * Markup without any body element is expected to be returned untouched.
   */
  public function test_extractBody_noBodyTags()
  {
      $fragment = '<b>Bold</b>';
      $this->assertExtractBody($fragment);
  }
  /**
   * Lowercase html/body wrappers are expected to be stripped, leaving the
   * inner markup.
   */
  public function test_extractBody_lowercaseBodyTags()
  {
      $document = '<html><body><b>Bold</b></body></html>';
      $inner = '<b>Bold</b>';
      $this->assertExtractBody($document, $inner);
  }
  /**
   * Uppercase HTML/BODY wrappers are expected to be stripped as well, with
   * the inner markup keeping its case.
   */
  public function test_extractBody_uppercaseBodyTags()
  {
      $document = '<HTML><BODY><B>Bold</B></BODY></HTML>';
      $inner = '<B>Bold</B>';
      $this->assertExtractBody($document, $inner);
  }
  /**
   * A full XHTML document (prolog, doctype, head and body) is expected to
   * reduce to the body contents, surrounding newlines included.
   */
  public function test_extractBody_realisticUseCase()
  {
      // Both literals are whitespace-sensitive: continuation lines start at
      // column 0 on purpose.
      $document = '<?xml version="1.0"
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title>xyz</title>
</head>
<body>
<form method="post" action="whatever1">
<div>
<input type="text" name="username" />
<input type="text" name="password" />
<input type="submit" />
</div>
</form>
</body>
</html>';
      $bodyContents = '
<form method="post" action="whatever1">
<div>
<input type="text" name="username" />
<input type="text" name="password" />
<input type="submit" />
</div>
</form>
';
      $this->assertExtractBody($document, $bodyContents);
  }
  /**
   * A body tag that carries attributes is still expected to be recognised
   * and stripped.
   */
  public function test_extractBody_bodyWithAttributes()
  {
      $document = '<html><body bgcolor="#F00"><b>Bold</b></body></html>';
      $inner = '<b>Bold</b>';
      $this->assertExtractBody($document, $inner);
  }
  /**
   * A body tag that is never closed must not trigger extraction; the input
   * is expected back unchanged.
   */
  public function test_extractBody_preserveUnclosedBody()
  {
      $unclosed = '<body>asdf';
      $this->assertExtractBody($unclosed);
  }
  /**
   * When several closing body tags appear, extraction is expected to run up
   * to the last one.
   */
  public function test_extractBody_useLastBody()
  {
      $document = '<body>foo</body>bar</body>';
      $inner = 'foo</body>bar';
      $this->assertExtractBody($document, $inner);
  }
  143. // HTMLPurifier_Lexer->tokenizeHTML() --------------------------------------
  /**
   * Tokenizes $input with every available lexer and compares the result
   * against the expected token list.
   *
   * DirectLex always runs; DOMLex and PH5P run only when the DOMDocument
   * class exists.
   *
   * @param string $input      HTML to tokenize.
   * @param array  $expect     Token list expected from every lexer that has
   *                           no entry in $alt_expect.
   * @param array  $alt_expect Per-lexer overrides keyed by lexer name
   *                           ('DirectLex', 'DOMLex', 'PH5P'). An array
   *                           replaces $expect for that lexer; false skips
   *                           the comparison for that lexer.
   */
  public function assertTokenization($input, $expect, $alt_expect = array())
  {
      $lexers = array('DirectLex' => new HTMLPurifier_Lexer_DirectLex());
      if (class_exists('DOMDocument')) {
          $lexers['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
          $lexers['PH5P'] = new HTMLPurifier_Lexer_PH5P();
      }
      foreach ($lexers as $name => $lexer) {
          // Tokenization happens even for lexers whose comparison is skipped.
          $actual = $lexer->tokenizeHTML($input, $this->config, $this->context);
          $overridden = isset($alt_expect[$name]);
          if ($overridden && $alt_expect[$name] === false) {
              continue;
          }
          $wanted = $overridden ? $alt_expect[$name] : $expect;
          $this->assertIdentical($actual, $wanted, "$name: %s");
          // Loose comparison is deliberate: the arrays hold distinct object
          // instances. On a mismatch, dump the tokens to help debugging.
          if ($wanted != $actual) {
              printTokens($actual);
          }
      }
  }
  /**
   * An empty string is expected to produce no tokens.
   */
  public function test_tokenizeHTML_emptyInput()
  {
      $noTokens = array();
      $this->assertTokenization('', $noTokens);
  }
  /**
   * Tag-free text is expected to become a single Text token.
   */
  public function test_tokenizeHTML_plainText()
  {
      $html = 'This is regular text.';
      $expected = array(new HTMLPurifier_Token_Text('This is regular text.'));
      $this->assertTokenization($html, $expected);
  }
  /**
   * Text mixed with one inline element is expected to split into
   * Text / Start / Text / End / Text tokens.
   */
  public function test_tokenizeHTML_textAndTags()
  {
      $html = 'This is <b>bold</b> text';
      $expected = array(
          new HTMLPurifier_Token_Text('This is '),
          new HTMLPurifier_Token_Start('b', array()),
          new HTMLPurifier_Token_Text('bold'),
          new HTMLPurifier_Token_End('b'),
          new HTMLPurifier_Token_Text(' text'),
      );
      $this->assertTokenization($html, $expected);
  }
  /**
   * Mixed-case tag names: the input opens with <DIV> and closes with </div>.
   */
  public function test_tokenizeHTML_normalizeCase()
  {
      $html = '<DIV>Totally rad dude. <b>asdf</b></div>';
      $expected = array(
          new HTMLPurifier_Token_Start('DIV', array()),
          new HTMLPurifier_Token_Text('Totally rad dude. '),
          new HTMLPurifier_Token_Start('b', array()),
          new HTMLPurifier_Token_Text('asdf'),
          new HTMLPurifier_Token_End('b'),
          new HTMLPurifier_Token_End('div'),
      );
      $this->assertTokenization($html, $expected);
  }
  /**
   * Badly nested, unknown tags: DirectLex is expected to report them
   * verbatim, while DOMLex and PH5P share a repaired token stream.
   */
  public function test_tokenizeHTML_notWellFormed()
  {
      $html = '<asdf></asdf><d></d><poOloka><poolasdf><ds></asdf></ASDF>';
      $verbatim = array(
          new HTMLPurifier_Token_Start('asdf'),
          new HTMLPurifier_Token_End('asdf'),
          new HTMLPurifier_Token_Start('d'),
          new HTMLPurifier_Token_End('d'),
          new HTMLPurifier_Token_Start('poOloka'),
          new HTMLPurifier_Token_Start('poolasdf'),
          new HTMLPurifier_Token_Start('ds'),
          new HTMLPurifier_Token_End('asdf'),
          new HTMLPurifier_Token_End('ASDF'),
      );
      // Token stream expected from the DOM-based lexers.
      $repaired = array(
          new HTMLPurifier_Token_Empty('asdf'),
          new HTMLPurifier_Token_Empty('d'),
          new HTMLPurifier_Token_Start('pooloka'),
          new HTMLPurifier_Token_Start('poolasdf'),
          new HTMLPurifier_Token_Empty('ds'),
          new HTMLPurifier_Token_End('poolasdf'),
          new HTMLPurifier_Token_End('pooloka'),
      );
      $perLexer = array(
          'DOMLex' => $repaired,
          'PH5P' => $repaired,
      );
      $this->assertTokenization($html, $verbatim, $perLexer);
  }
  /**
   * A tab and a newline between attributes inside a tag must still yield
   * both attributes.
   */
  public function test_tokenizeHTML_whitespaceInTag()
  {
      $html = '<a'."\t".'href="foobar.php"'."\n".'title="foo!">Link to <b id="asdf">foobar</b></a>';
      $anchorAttr = array('href' => 'foobar.php', 'title' => 'foo!');
      $boldAttr = array('id' => 'asdf');
      $expected = array(
          new HTMLPurifier_Token_Start('a', $anchorAttr),
          new HTMLPurifier_Token_Text('Link to '),
          new HTMLPurifier_Token_Start('b', $boldAttr),
          new HTMLPurifier_Token_Text('foobar'),
          new HTMLPurifier_Token_End('b'),
          new HTMLPurifier_Token_End('a'),
      );
      $this->assertTokenization($html, $expected);
  }
  /**
   * An entity inside a lone attribute value is expected to be decoded.
   */
  public function test_tokenizeHTML_singleAttribute()
  {
      $html = '<br style="&amp;" />';
      $attr = array('style' => '&');
      $expected = array(new HTMLPurifier_Token_Empty('br', $attr));
      $this->assertTokenization($html, $expected);
  }
  /**
   * A self-closing <br /> is expected to become one Empty token.
   */
  public function test_tokenizeHTML_emptyTag()
  {
      $html = '<br />';
      $expected = array(new HTMLPurifier_Token_Empty('br'));
      $this->assertTokenization($html, $expected);
  }
  /**
   * A well-formed comment is expected to become one Comment token holding
   * its inner text.
   */
  public function test_tokenizeHTML_comment()
  {
      $html = '<!-- Comment -->';
      $expected = array(new HTMLPurifier_Token_Comment(' Comment '));
      $this->assertTokenization($html, $expected);
  }
  /**
   * A comment closed with three dashes is expected to keep the extra dash
   * as part of its text.
   */
  public function test_tokenizeHTML_malformedComment()
  {
      $html = '<!-- not so well formed --->';
      $expected = array(new HTMLPurifier_Token_Comment(' not so well formed -'));
      $this->assertTokenization($html, $expected);
  }
  /**
   * A tag that is never closed with '>': the default expectation treats it
   * as literal text, DOMLex recovers an empty element, PH5P is skipped.
   */
  public function test_tokenizeHTML_unterminatedTag()
  {
      $html = '<a href=""';
      $asText = array(new HTMLPurifier_Token_Text('<a href=""'));
      $perLexer = array(
          // The text fallback is preferred by this suite, though it is
          // non-standard behaviour.
          'DOMLex' => array(new HTMLPurifier_Token_Empty('a', array('href' => ''))),
          // PH5P output is unusable here (it picks up scaffolding too).
          'PH5P' => false,
      );
      $this->assertTokenization($html, $asText, $perLexer);
  }
  /**
   * &lt; and &gt; are expected to decode inside text; PH5P emits each
   * decoded piece as its own Text token.
   */
  public function test_tokenizeHTML_specialEntities()
  {
      $html = '&lt;b&gt;';
      $merged = array(new HTMLPurifier_Token_Text('<b>'));
      // Some parsers separate the entities out.
      $separated = array(
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_Text('b'),
          new HTMLPurifier_Token_Text('>'),
      );
      $this->assertTokenization($html, $merged, array('PH5P' => $separated));
  }
  /**
   * A stray quote in attribute-name position: the default expectation is a
   * bare empty <a>, DirectLex keeps the quote as an attribute name, and
   * PH5P is covered by its own dedicated test.
   */
  public function test_tokenizeHTML_earlyQuote()
  {
      $html = '<a "=>';
      $expected = array(new HTMLPurifier_Token_Empty('a'));
      $perLexer = array(
          // DirectLex handles this input poorly.
          'DirectLex' => array(
              new HTMLPurifier_Token_Start('a', array('"' => '')),
          ),
          // Behaviour varies, so PH5P is checked separately.
          'PH5P' => false,
      );
      $this->assertTokenization($html, $expected, $perLexer);
  }
  /**
   * PH5P-specific variant of the early-quote case. The expected token type
   * depends on whether the context holds a truthy 'PH5PError' entry after
   * tokenization. Does nothing when DOMDocument is unavailable.
   */
  public function test_tokenizeHTML_earlyQuote_PH5P()
  {
      if (!class_exists('DOMDocument')) {
          return;
      }
      $lexer = new HTMLPurifier_Lexer_PH5P();
      $result = $lexer->tokenizeHTML('<a "=>', $this->config, $this->context);
      $attr = array('"' => '');
      // NOTE(review): the second argument to get() presumably suppresses the
      // "missing key" error - confirm against HTMLPurifier_Context.
      if ($this->context->get('PH5PError', true)) {
          $expected = array(new HTMLPurifier_Token_Start('a', $attr));
      } else {
          $expected = array(new HTMLPurifier_Token_Empty('a', $attr));
      }
      $this->assertIdentical($expected, $result);
  }
  /**
   * A lone raw double quote is expected to become a Text token.
   */
  public function test_tokenizeHTML_unescapedQuote()
  {
      $html = '"';
      $expected = array(new HTMLPurifier_Token_Text('"'));
      $this->assertTokenization($html, $expected);
  }
  /**
   * &quot; is expected to decode to a double quote inside a Text token.
   */
  public function test_tokenizeHTML_escapedQuote()
  {
      $html = '&quot;';
      $expected = array(new HTMLPurifier_Token_Text('"'));
      $this->assertTokenization($html, $expected);
  }
  /**
   * CDATA contents are expected to come through as literal text, with
   * markup and entities left alone; PH5P reports the same text in pieces.
   */
  public function test_tokenizeHTML_cdata()
  {
      $html = '<![CDATA[You <b>can&#39;t</b> get me!]]>';
      $literal = array(
          new HTMLPurifier_Token_Text('You <b>can&#39;t</b> get me!'),
      );
      $fragments = array(
          new HTMLPurifier_Token_Text('You '),
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_Text('b'),
          new HTMLPurifier_Token_Text('>'),
          new HTMLPurifier_Token_Text('can'),
          new HTMLPurifier_Token_Text('&'),
          new HTMLPurifier_Token_Text('#39;t'),
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_Text('/b'),
          new HTMLPurifier_Token_Text('>'),
          new HTMLPurifier_Token_Text(' get me!'),
      );
      $this->assertTokenization($html, $literal, array('PH5P' => $fragments));
  }
  /**
   * The named entity &theta; is expected to decode to the byte sequence
   * CE B8.
   */
  public function test_tokenizeHTML_characterEntity()
  {
      $html = '&theta;';
      $expected = array(new HTMLPurifier_Token_Text("\xCE\xB8"));
      $this->assertTokenization($html, $expected);
  }
  /**
   * A named entity inside CDATA is expected to stay undecoded; PH5P reports
   * it as two Text tokens.
   */
  public function test_tokenizeHTML_characterEntityInCDATA()
  {
      $html = '<![CDATA[&rarr;]]>';
      $literal = array(new HTMLPurifier_Token_Text("&rarr;"));
      $fragments = array(
          new HTMLPurifier_Token_Text('&'),
          new HTMLPurifier_Token_Text('rarr;'),
      );
      $this->assertTokenization($html, $literal, array('PH5P' => $fragments));
  }
  /**
   * &amp; inside an href value is expected to decode to a bare ampersand.
   */
  public function test_tokenizeHTML_entityInAttribute()
  {
      $html = '<a href="index.php?title=foo&amp;id=bar">Link</a>';
      $attr = array('href' => 'index.php?title=foo&id=bar');
      $expected = array(
          new HTMLPurifier_Token_Start('a', $attr),
          new HTMLPurifier_Token_Text('Link'),
          new HTMLPurifier_Token_End('a'),
      );
      $this->assertTokenization($html, $expected);
  }
  /**
   * The raw byte sequence CE B8 is expected to pass through tokenization
   * unchanged.
   */
  public function test_tokenizeHTML_preserveUTF8()
  {
      $html = "\xCE\xB8";
      $expected = array(new HTMLPurifier_Token_Text("\xCE\xB8"));
      $this->assertTokenization($html, $expected);
  }
  /**
   * &lt; inside an attribute value is expected to decode to '<'.
   */
  public function test_tokenizeHTML_specialEntityInAttribute()
  {
      $html = '<br test="x &lt; 6" />';
      $attr = array('test' => 'x < 6');
      $expected = array(new HTMLPurifier_Token_Empty('br', $attr));
      $this->assertTokenization($html, $expected);
  }
  /**
   * A '<' that does not start a tag (as in the "<3" emoticon) must survive
   * as text. DirectLex and PH5P split the text at the '<'; DOMLex keeps it
   * as one Text token.
   */
  public function test_tokenizeHTML_emoticonProtection()
  {
      $html = '<b>Whoa! <3 That\'s not good >.></b>';
      $split = array(
          new HTMLPurifier_Token_Start('b'),
          new HTMLPurifier_Token_Text('Whoa! '),
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_Text('3 That\'s not good >.>'),
          new HTMLPurifier_Token_End('b'),
      );
      // DOMLex absorbs the text into a single token.
      $absorbed = array(
          new HTMLPurifier_Token_Start('b'),
          new HTMLPurifier_Token_Text('Whoa! <3 That\'s not good >.>'),
          new HTMLPurifier_Token_End('b'),
      );
      $perLexer = array(
          'DOMLex' => $absorbed,
          // PH5P groups the text exactly like the default expectation.
          'PH5P' => $split,
      );
      $this->assertTokenization($html, $split, $perLexer);
  }
  /**
   * Angle brackets inside a comment must not end it early; the element
   * that follows is still expected to be tokenized.
   */
  public function test_tokenizeHTML_commentWithFunkyChars()
  {
      $html = '<!-- This >< comment --><br />';
      $expected = array(
          new HTMLPurifier_Token_Comment(' This >< comment '),
          new HTMLPurifier_Token_Empty('br'),
      );
      $this->assertTokenization($html, $expected);
  }
  /**
   * A comment that is never closed is expected to run to the end of the
   * input. Only DirectLex is compared; DOMLex and PH5P are skipped.
   */
  public function test_tokenizeHTML_unterminatedComment()
  {
      $html = '<!-- This >< comment';
      $expected = array(new HTMLPurifier_Token_Comment(' This >< comment'));
      $skipDomLexers = array(
          'DOMLex' => false,
          'PH5P' => false,
      );
      $this->assertTokenization($html, $expected, $skipDomLexers);
  }
  /**
   * With HTML.Trusted enabled, script contents are expected to stay a
   * single Text token even though they contain tag-like text. PH5P is
   * skipped.
   */
  public function test_tokenizeHTML_scriptCDATAContents()
  {
      $this->config->set('HTML.Trusted', true);
      $html = 'Foo: <script>alert("<foo>");</script>';
      $expected = array(
          new HTMLPurifier_Token_Text('Foo: '),
          new HTMLPurifier_Token_Start('script'),
          new HTMLPurifier_Token_Text('alert("<foo>");'),
          new HTMLPurifier_Token_End('script'),
      );
      // PH5P, for some reason, moves the script up into <head>.
      $perLexer = array('PH5P' => false);
      $this->assertTokenization($html, $expected, $perLexer);
  }
  /**
   * Entities and raw specials inside a comment are expected to be kept
   * verbatim.
   */
  public function test_tokenizeHTML_entitiesInComment()
  {
      $html = '<!-- This comment < &lt; & -->';
      $expected = array(new HTMLPurifier_Token_Comment(' This comment < &lt; & '));
      $this->assertTokenization($html, $expected);
  }
  /**
   * Angle brackets inside a quoted attribute value: the default expectation
   * keeps them in the value, while DirectLex ends the tag at the first '>'.
   */
  public function test_tokenizeHTML_attributeWithSpecialCharacters()
  {
      $html = '<a href="><>">';
      $expected = array(
          new HTMLPurifier_Token_Empty('a', array('href' => '><>')),
      );
      $directLexTokens = array(
          new HTMLPurifier_Token_Start('a', array('href' => '')),
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_Text('">'),
      );
      $this->assertTokenization($html, $expected, array('DirectLex' => $directLexTokens));
  }
  /**
   * Slashes inside an attribute value must not be mistaken for the
   * self-closing marker.
   */
  public function test_tokenizeHTML_emptyTagWithSlashInAttribute()
  {
      $html = '<param name="src" value="http://example.com/video.wmv" />';
      $attr = array('name' => 'src', 'value' => 'http://example.com/video.wmv');
      $expected = array(new HTMLPurifier_Token_Empty('param', $attr));
      $this->assertTokenization($html, $expected);
  }
  /**
   * Style element whose contents are wrapped in an HTML comment. The
   * default expectation is a Text token for the CSS; DirectLex yields a
   * Comment token instead, and PH5P is skipped. The whole test is skipped
   * when LIBXML_VERSION is not defined.
   */
  public function test_tokenizeHTML_style()
  {
      // DirectLex defers to RemoveForeignElements for textification.
      $commentForm = array(
          new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
          new HTMLPurifier_Token_Comment("\ndiv {}\n"),
          new HTMLPurifier_Token_End('style'),
      );
      $extra = array(
          // PH5P doesn't seem to like style tags.
          'PH5P' => false,
          'DirectLex' => $commentForm,
      );
      if (!defined('LIBXML_VERSION')) {
          // LIBXML_VERSION is missing in early versions of PHP, prior to
          // 1.30 of php-src/ext/libxml/libxml.c (version-wise, this
          // translates to 5.0.x). In such cases, punt the test entirely.
          return;
      }
      if (LIBXML_VERSION < 20628) {
          // libxml's behavior is wrong prior to this version, so make
          // appropriate accommodations.
          $extra['DOMLex'] = $extra['DirectLex'];
      }
      // Whitespace-sensitive literal: continuation lines start at column 0.
      $html = '<style type="text/css"><!--
div {}
--></style>';
      $expected = array(
          new HTMLPurifier_Token_Start('style', array('type' => 'text/css')),
          new HTMLPurifier_Token_Text("\ndiv {}\n"),
          new HTMLPurifier_Token_End('style'),
      );
      $this->assertTokenization($html, $expected, $extra);
  }
  /**
   * Tag name containing '@' followed by a surplus '>': DirectLex keeps
   * 'a@' as the tag name, the other lexers are expected to repair it into
   * a balanced <a> element.
   */
  public function test_tokenizeHTML_tagWithAtSignAndExtraGt()
  {
      $html = '<a@>>';
      // Technically this is invalid, but it won't be a problem with invalid
      // element removal; it also mimics Mozilla's parsing of the tag.
      $directLexTokens = array(
          new HTMLPurifier_Token_Start('a@'),
          new HTMLPurifier_Token_Text('>'),
      );
      $repaired = array(
          new HTMLPurifier_Token_Start('a'),
          new HTMLPurifier_Token_Text('>'),
          new HTMLPurifier_Token_End('a'),
      );
      $this->assertTokenization($html, $repaired, array('DirectLex' => $directLexTokens));
  }
  /**
   * "<3" between two <br /> tags must survive as text; DOMLex reports it as
   * one Text token, the default expectation as two.
   */
  public function test_tokenizeHTML_emoticonHeart()
  {
      $html = '<br /><3<br />';
      $split = array(
          new HTMLPurifier_Token_Empty('br'),
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_Text('3'),
          new HTMLPurifier_Token_Empty('br'),
      );
      $joined = array(
          new HTMLPurifier_Token_Empty('br'),
          new HTMLPurifier_Token_Text('<3'),
          new HTMLPurifier_Token_Empty('br'),
      );
      $this->assertTokenization($html, $split, array('DOMLex' => $joined));
  }
  /**
   * "<<" inside an element must survive as text; DOMLex reports it as one
   * Text token, the default expectation as two.
   */
  public function test_tokenizeHTML_emoticonShiftyEyes()
  {
      $html = '<b><<</b>';
      $split = array(
          new HTMLPurifier_Token_Start('b'),
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_End('b'),
      );
      $joined = array(
          new HTMLPurifier_Token_Start('b'),
          new HTMLPurifier_Token_Text('<<'),
          new HTMLPurifier_Token_End('b'),
      );
      $this->assertTokenization($html, $split, array('DOMLex' => $joined));
  }
  /**
   * A '<' followed by a space must be treated as text; DOMLex merges it
   * with the space, the default expectation keeps them apart.
   */
  public function test_tokenizeHTML_eon1996()
  {
      $html = '< <b>test</b>';
      $split = array(
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_Text(' '),
          new HTMLPurifier_Token_Start('b'),
          new HTMLPurifier_Token_Text('test'),
          new HTMLPurifier_Token_End('b'),
      );
      $joined = array(
          new HTMLPurifier_Token_Text('< '),
          new HTMLPurifier_Token_Start('b'),
          new HTMLPurifier_Token_Text('test'),
          new HTMLPurifier_Token_End('b'),
      );
      $this->assertTokenization($html, $split, array('DOMLex' => $joined));
  }
  /**
   * Body tags inside CDATA are expected to stay literal text; PH5P reports
   * the same text split into pieces.
   */
  public function test_tokenizeHTML_bodyInCDATA()
  {
      $html = '<![CDATA[<body>Foo</body>]]>';
      $literal = array(
          new HTMLPurifier_Token_Text('<body>Foo</body>'),
      );
      $fragments = array(
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_Text('body'),
          new HTMLPurifier_Token_Text('>'),
          new HTMLPurifier_Token_Text('Foo'),
          new HTMLPurifier_Token_Text('<'),
          new HTMLPurifier_Token_Text('/body'),
          new HTMLPurifier_Token_Text('>'),
      );
      $this->assertTokenization($html, $literal, array('PH5P' => $fragments));
  }
  /**
   * Unnamed case: a self-closing img nested inside an anchor.
   */
  public function test_tokenizeHTML_()
  {
      $html = '<a><img /></a>';
      $expected = array(
          new HTMLPurifier_Token_Start('a'),
          new HTMLPurifier_Token_Empty('img'),
          new HTMLPurifier_Token_End('a'),
      );
      $this->assertTokenization($html, $expected);
  }
  /**
   * An IE conditional comment, including everything nested in it, is
   * expected to produce no tokens.
   */
  public function test_tokenizeHTML_ignoreIECondComment()
  {
      $html = '<!--[if IE]>foo<a>bar<!-- baz --><![endif]-->';
      $noTokens = array();
      $this->assertTokenization($html, $noTokens);
  }
  /**
   * With Core.RemoveProcessingInstructions enabled, a processing
   * instruction is expected to produce no tokens.
   */
  public function test_tokenizeHTML_removeProcessingInstruction()
  {
      $this->config->set('Core.RemoveProcessingInstructions', true);
      $html = '<?xml blah blah ?>';
      $noTokens = array();
      $this->assertTokenization($html, $noTokens);
  }
  /**
   * With Core.NormalizeNewlines enabled, CR and CRLF are expected to be
   * rewritten to LF.
   */
  public function test_tokenizeHTML_removeNewline()
  {
      $this->config->set('Core.NormalizeNewlines', true);
      $html = "plain\rtext\r\n";
      $expected = array(new HTMLPurifier_Token_Text("plain\ntext\n"));
      $this->assertTokenization($html, $expected);
  }
  /**
   * With Core.NormalizeNewlines disabled, CR and CRLF are expected to be
   * left exactly as given.
   */
  public function test_tokenizeHTML_noRemoveNewline()
  {
      $this->config->set('Core.NormalizeNewlines', false);
      $html = "plain\rtext\r\n";
      $expected = array(new HTMLPurifier_Token_Text("plain\rtext\r\n"));
      $this->assertTokenization($html, $expected);
  }
  /**
   * Two conditional comments with text between them: only the text between
   * them is expected to remain, so the match must not span both comments.
   */
  public function test_tokenizeHTML_conditionalCommentUngreedy()
  {
      $html = '<!--[if gte mso 9]>a<![endif]-->b<!--[if gte mso 9]>c<![endif]-->';
      $expected = array(new HTMLPurifier_Token_Text("b"));
      $this->assertTokenization($html, $expected);
  }
  /**
   * An img tag without a self-closing slash: the default expectation is an
   * Empty token, while DirectLex reports a Start token with the same
   * attributes.
   */
  public function test_tokenizeHTML_imgTag()
  {
      $html = '<img src="img_11775.jpg" alt="[Img #11775]" id="EMBEDDED_IMG_11775" >';
      $attr = array(
          'src' => 'img_11775.jpg',
          'alt' => '[Img #11775]',
          'id' => 'EMBEDDED_IMG_11775',
      );
      $directLexTokens = array(new HTMLPurifier_Token_Start('img', $attr));
      $expected = array(new HTMLPurifier_Token_Empty('img', $attr));
      $this->assertTokenization($html, $expected, array('DirectLex' => $directLexTokens));
  }
  730. /*
  731. public function test_tokenizeHTML_()
  732. {
  733. $this->assertTokenization(
  734. ,
  735. array(
  736. )
  737. );
  738. }
  739. */
  740. }
  741. // vim: et sw=4 sts=4