You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

601 lines
24KB

  1. <?php
  /**
   * Takes tokens and makes them well-formed (balances end tags, etc.)
   *
   * Specification of the armor attributes this strategy uses:
   *
   *      - MakeWellFormed_TagClosedError: This armor field is used to
   *        suppress tag closed errors for certain tokens [TagClosedSuppress],
   *        in particular, if a tag was generated automatically by HTML
   *        Purifier, we may rely on our infrastructure to close it for us
   *        and shouldn't report an error to the user [TagClosedAuto].
   */
  class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
  {
      /**
       * Array stream of tokens being processed.
       * @type HTMLPurifier_Token[]
       */
      protected $tokens;

      /**
       * Current token.
       * @type HTMLPurifier_Token
       */
      protected $token;

      /**
       * Zipper managing the true state.
       * @type HTMLPurifier_Zipper
       */
      protected $zipper;

      /**
       * Current nesting of elements.
       * @type array
       */
      protected $stack;

      /**
       * Injectors active in this stream processing.
       * @type HTMLPurifier_Injector[]
       */
      protected $injectors;

      /**
       * Current instance of HTMLPurifier_Config.
       * @type HTMLPurifier_Config
       */
      protected $config;

      /**
       * Current instance of HTMLPurifier_Context.
       * @type HTMLPurifier_Context
       */
      protected $context;

      /**
       * Walks the token stream once, balancing start/end tags, converting
       * mis-declared empty/start tags, auto-closing parents that cannot
       * contain the current element and giving every active injector a
       * chance to rewrite each token.
       *
       * @param HTMLPurifier_Token[] $tokens
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return HTMLPurifier_Token[]
       * @throws HTMLPurifier_Exception
       */
      public function execute($tokens, $config, $context)
      {
          $definition = $config->getHTMLDefinition();

          // local variables
          $generator = new HTMLPurifier_Generator($config, $context);
          $escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
          // used for autoclose early abortion
          $global_parent_allowed_elements = $definition->info_parent_def->child->getAllowedElements($config);
          $e = $context->get('ErrorCollector', true);
          // Injector index. NB: the foreach loops below deliberately leak
          // their key into $i so the rewind check at the top of the main
          // loop knows which injector touched the stream last.
          $i = false;
          list($zipper, $token) = HTMLPurifier_Zipper::fromArray($tokens);
          if ($token === null) {
              return array();
          }
          $reprocess = false; // whether or not to reprocess the same token
          $stack = array();

          // member variables (bound by reference so helpers see live state)
          $this->stack =& $stack;
          $this->tokens =& $tokens;
          $this->token =& $token;
          $this->zipper =& $zipper;
          $this->config = $config;
          $this->context = $context;

          // context variables
          $context->register('CurrentNesting', $stack);
          $context->register('InputZipper', $zipper);
          $context->register('CurrentToken', $token);

          // must happen after the context variables above are registered,
          // because injectors are prepared against $context
          $this->injectors = $this->loadInjectors($definition, $config, $context);

          // a note on reprocessing:
          //      In order to reduce code duplication, whenever some code needs
          //      to make HTML changes in order to make things "correct", the
          //      new HTML gets sent through the purifier, regardless of its
          //      status. This means that if we add a start token, because it
          //      was totally necessary, we don't have to update nesting; we just
          //      punt ($reprocess = true; continue;) and it does that for us.

          for (;;
              // only increment if we don't need to reprocess
              $reprocess ? $reprocess = false : $token = $zipper->next($token)) {

              // check for a rewind
              if (is_int($i)) {
                  // possibility: disable rewinding if the current token has a
                  // rewind set on it already. This would offer protection from
                  // infinite loop, but might hinder some advanced rewinding.
                  $rewind_offset = $this->injectors[$i]->getRewindOffset();
                  if (is_int($rewind_offset)) {
                      for ($j = 0; $j < $rewind_offset; $j++) {
                          if (empty($zipper->front)) {
                              break;
                          }
                          $token = $zipper->prev($token);
                          // indicate that other injectors should not process this token,
                          // but we need to reprocess it
                          unset($token->skip[$i]);
                          $token->rewind = $i;
                          // undo the nesting effect the token had when first seen
                          if ($token instanceof HTMLPurifier_Token_Start) {
                              array_pop($this->stack);
                          } elseif ($token instanceof HTMLPurifier_Token_End) {
                              $this->stack[] = $token->start;
                          }
                      }
                  }
                  $i = false;
              }

              // handle case of document end
              if ($token === null) {
                  // kill processing if stack is empty
                  if (empty($this->stack)) {
                      break;
                  }

                  // peek
                  $top_nesting = array_pop($this->stack);
                  $this->stack[] = $top_nesting;

                  // send error [TagClosedSuppress]
                  if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
                      $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
                  }

                  // append, don't splice, since this is the end
                  $token = new HTMLPurifier_Token_End($top_nesting->name);

                  // punt!
                  $reprocess = true;
                  continue;
              }

              // quick-check: if it's not a tag, no need to process
              if (empty($token->is_tag)) {
                  if ($token instanceof HTMLPurifier_Token_Text) {
                      foreach ($this->injectors as $i => $injector) {
                          if (isset($token->skip[$i])) {
                              continue;
                          }
                          if ($token->rewind !== null && $token->rewind !== $i) {
                              continue;
                          }
                          // Hand the injector a separate variable; presumably
                          // handleText() takes it by reference and may replace
                          // it, and $token itself is reference-bound to
                          // $this->token, which processToken() still needs
                          // to point at the token currently in the stream.
                          $r = $token;
                          $injector->handleText($r);
                          $token = $this->processToken($r, $i);
                          $reprocess = true;
                          break;
                      }
                  }
                  // another possibility is a comment
                  continue;
              }

              // look the element up once; null means the name is unknown
              $token_def = isset($definition->info[$token->name])
                  ? $definition->info[$token->name]
                  : null;
              if ($token_def !== null) {
                  $type = $token_def->child->type;
              } else {
                  $type = false; // Type is unknown, treat accordingly
              }

              // quick tag checks: anything that's *not* an end tag
              $ok = false;
              if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
                  // claims to be a start tag but is empty
                  $token = new HTMLPurifier_Token_Empty(
                      $token->name,
                      $token->attr,
                      $token->line,
                      $token->col,
                      $token->armor
                  );
                  $ok = true;
              } elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
                  // claims to be empty but really is a start tag
                  // NB: this assignment is required (it replaces the current
                  // token, via the reference binding, before the insertion)
                  $old_token = $token;
                  $token = new HTMLPurifier_Token_End($token->name);
                  $token = $this->insertBefore(
                      new HTMLPurifier_Token_Start(
                          $old_token->name,
                          $old_token->attr,
                          $old_token->line,
                          $old_token->col,
                          $old_token->armor
                      )
                  );
                  // punt (since we had to modify the input stream in a non-trivial way)
                  $reprocess = true;
                  continue;
              } elseif ($token instanceof HTMLPurifier_Token_Empty) {
                  // real empty token
                  $ok = true;
              } elseif ($token instanceof HTMLPurifier_Token_Start) {
                  // start tag

                  // ...unless they also have to close their parent
                  if (!empty($this->stack)) {

                      // Performance note: you might think that it's rather
                      // inefficient, recalculating the autoclose information
                      // for every tag that a token closes (since when we
                      // do an autoclose, we push a new token into the
                      // stream and then /process/ that, before
                      // re-processing this token.)  But this is
                      // necessary, because an injector can make an
                      // arbitrary transformations to the autoclosing
                      // tokens we introduce, so things may have changed
                      // in the meantime.  Also, doing the inefficient thing is
                      // "easy" to reason about (for certain perverse definitions
                      // of "easy")

                      $parent = array_pop($this->stack);
                      $this->stack[] = $parent;

                      $parent_def = null;
                      $parent_elements = null;
                      $autoclose = false;
                      if (isset($definition->info[$parent->name])) {
                          $parent_def = $definition->info[$parent->name];
                          $parent_elements = $parent_def->child->getAllowedElements($config);
                          $autoclose = !isset($parent_elements[$token->name]);
                      }

                      // Name of the element that may wrap this one, if any.
                      // Guarded so an element missing from the definition is
                      // simply treated as having no wrapper.
                      $wrapname = ($autoclose && $token_def !== null) ? $token_def->wrap : null;

                      if ($wrapname) {
                          // Check if an element can be wrapped by another
                          // element to make it valid in a context (for
                          // example, <ul><ul> needs a <li> in between)
                          $wrapdef = $definition->info[$wrapname];
                          $elements = $wrapdef->child->getAllowedElements($config);
                          if (isset($elements[$token->name]) && isset($parent_elements[$wrapname])) {
                              $newtoken = new HTMLPurifier_Token_Start($wrapname);
                              $token = $this->insertBefore($newtoken);
                              $reprocess = true;
                              continue;
                          }
                      }

                      $carryover = false;
                      if ($autoclose && $parent_def->formatting) {
                          $carryover = true;
                      }

                      if ($autoclose) {
                          // check if this autoclose is doomed to fail
                          // (this rechecks $parent, which is harmless)
                          $autoclose_ok = isset($global_parent_allowed_elements[$token->name]);
                          if (!$autoclose_ok) {
                              foreach ($this->stack as $ancestor) {
                                  if (!isset($definition->info[$ancestor->name])) {
                                      // unknown ancestor: nothing says it
                                      // could hold this element
                                      continue;
                                  }
                                  $elements = $definition->info[$ancestor->name]->child->getAllowedElements($config);
                                  if (isset($elements[$token->name])) {
                                      $autoclose_ok = true;
                                      break;
                                  }
                                  if ($wrapname) {
                                      $wrapdef = $definition->info[$wrapname];
                                      $wrap_elements = $wrapdef->child->getAllowedElements($config);
                                      if (isset($wrap_elements[$token->name]) && isset($elements[$wrapname])) {
                                          $autoclose_ok = true;
                                          break;
                                      }
                                  }
                              }
                          }
                          if ($autoclose_ok) {
                              // errors need to be updated
                              $new_token = new HTMLPurifier_Token_End($parent->name);
                              $new_token->start = $parent;
                              // [TagClosedSuppress]
                              if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
                                  if (!$carryover) {
                                      $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
                                  } else {
                                      $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
                                  }
                              }
                              if ($carryover) {
                                  // close the formatting parent, emit the
                                  // current token, then reopen a copy of it
                                  $element = clone $parent;
                                  // [TagClosedAuto]
                                  $element->armor['MakeWellFormed_TagClosedError'] = true;
                                  $element->carryover = true;
                                  $token = $this->processToken(array($new_token, $token, $element));
                              } else {
                                  $token = $this->insertBefore($new_token);
                              }
                          } else {
                              $token = $this->remove();
                          }
                          $reprocess = true;
                          continue;
                      }

                  }
                  $ok = true;
              }

              if ($ok) {
                  foreach ($this->injectors as $i => $injector) {
                      if (isset($token->skip[$i])) {
                          continue;
                      }
                      if ($token->rewind !== null && $token->rewind !== $i) {
                          continue;
                      }
                      $r = $token;
                      $injector->handleElement($r);
                      $token = $this->processToken($r, $i);
                      $reprocess = true;
                      break;
                  }
                  if (!$reprocess) {
                      // ah, nothing interesting happened; do normal processing
                      if ($token instanceof HTMLPurifier_Token_Start) {
                          $this->stack[] = $token;
                      } elseif ($token instanceof HTMLPurifier_Token_End) {
                          throw new HTMLPurifier_Exception(
                              'Improper handling of end tag in start code; possible error in MakeWellFormed'
                          );
                      }
                  }
                  continue;
              }

              // sanity check: we should be dealing with a closing tag
              if (!$token instanceof HTMLPurifier_Token_End) {
                  throw new HTMLPurifier_Exception('Unaccounted for tag token in input stream, bug in HTML Purifier');
              }

              // make sure that we have something open
              if (empty($this->stack)) {
                  if ($escape_invalid_tags) {
                      if ($e) {
                          $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag to text');
                      }
                      $token = new HTMLPurifier_Token_Text($generator->generateFromToken($token));
                  } else {
                      if ($e) {
                          $e->send(E_WARNING, 'Strategy_MakeWellFormed: Unnecessary end tag removed');
                      }
                      $token = $this->remove();
                  }
                  $reprocess = true;
                  continue;
              }

              // first, check for the simplest case: everything closes neatly.
              // Eventually, everything passes through here; if there are problems
              // we modify the input stream accordingly and then punt, so that
              // the tokens get processed again.
              $current_parent = array_pop($this->stack);
              if ($current_parent->name == $token->name) {
                  $token->start = $current_parent;
                  foreach ($this->injectors as $i => $injector) {
                      if (isset($token->skip[$i])) {
                          continue;
                      }
                      if ($token->rewind !== null && $token->rewind !== $i) {
                          continue;
                      }
                      $r = $token;
                      $injector->handleEnd($r);
                      $token = $this->processToken($r, $i);
                      // the token will be reprocessed, so restore the nesting
                      $this->stack[] = $current_parent;
                      $reprocess = true;
                      break;
                  }
                  continue;
              }

              // okay, so we're trying to close the wrong tag

              // undo the previous pop
              $this->stack[] = $current_parent;

              // scroll back the entire nest, trying to find our tag.
              // (feature could be to specify how far you'd like to go)
              $size = count($this->stack);
              // -2 because -1 is the last element, but we already checked that
              $skipped_tags = false;
              for ($j = $size - 2; $j >= 0; $j--) {
                  if ($this->stack[$j]->name == $token->name) {
                      $skipped_tags = array_slice($this->stack, $j);
                      break;
                  }
              }

              // we didn't find the tag, so remove
              if ($skipped_tags === false) {
                  if ($escape_invalid_tags) {
                      if ($e) {
                          $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag to text');
                      }
                      $token = new HTMLPurifier_Token_Text($generator->generateFromToken($token));
                  } else {
                      if ($e) {
                          $e->send(E_WARNING, 'Strategy_MakeWellFormed: Stray end tag removed');
                      }
                      $token = $this->remove();
                  }
                  $reprocess = true;
                  continue;
              }

              // do errors, in REVERSE $j order: a,b,c with </a></b></c>
              $c = count($skipped_tags);
              if ($e) {
                  for ($j = $c - 1; $j > 0; $j--) {
                      // notice we exclude $j == 0, i.e. the current ending tag, from
                      // the errors... [TagClosedSuppress]
                      if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
                          $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
                      }
                  }
              }

              // insert tags, in FORWARD $j order: c,b,a with </a></b></c>
              $replace = array($token);
              for ($j = 1; $j < $c; $j++) {
                  // ...as well as from the insertions
                  $new_token = new HTMLPurifier_Token_End($skipped_tags[$j]->name);
                  $new_token->start = $skipped_tags[$j];
                  array_unshift($replace, $new_token);
                  if (isset($definition->info[$new_token->name]) && $definition->info[$new_token->name]->formatting) {
                      // [TagClosedAuto]
                      $element = clone $skipped_tags[$j];
                      $element->carryover = true;
                      $element->armor['MakeWellFormed_TagClosedError'] = true;
                      $replace[] = $element;
                  }
              }
              $token = $this->processToken($replace);
              $reprocess = true;
              continue;
          }

          $context->destroy('CurrentToken');
          $context->destroy('CurrentNesting');
          $context->destroy('InputZipper');

          unset($this->injectors, $this->stack, $this->tokens);
          return $zipper->toArray($token);
      }

      /**
       * Builds the list of injectors for this run: the enabled AutoFormat
       * injectors, those supplied by the HTML definition, and the custom
       * ones, in that order. Each is prepared against the config/context;
       * an injector whose prepare() reports an error is dropped with an
       * E_USER_WARNING.
       *
       * @param HTMLPurifier_HTMLDefinition $definition
       * @param HTMLPurifier_Config $config
       * @param HTMLPurifier_Context $context
       * @return HTMLPurifier_Injector[] Zero-indexed list of usable injectors
       */
      private function loadInjectors($definition, $config, $context)
      {
          $enabled = $config->getBatch('AutoFormat');
          $def_injectors = $definition->info_injector;
          $custom_injectors = $enabled['Custom'];
          unset($enabled['Custom']); // special case

          $candidates = array();
          foreach ($enabled as $injector => $b) {
              // XXX: Fix with a legitimate lookup table of enabled filters
              if (strpos($injector, '.') !== false) {
                  continue;
              }
              $injector = "HTMLPurifier_Injector_$injector";
              if (!$b) {
                  continue;
              }
              $candidates[] = new $injector;
          }
          foreach ($def_injectors as $injector) {
              // assumed to be objects
              $candidates[] = $injector;
          }
          foreach ($custom_injectors as $injector) {
              if (!$injector) {
                  continue;
              }
              if (is_string($injector)) {
                  $injector = "HTMLPurifier_Injector_$injector";
                  $injector = new $injector;
              }
              $candidates[] = $injector;
          }

          // Keep only the injectors that prepare cleanly. A fresh list is
          // built rather than splicing the list being iterated: splicing
          // renumbers the entries while foreach keeps handing out the old
          // indices, so a second failing injector used to be left enabled
          // (or a healthy one removed in its place).
          $prepared = array();
          foreach ($candidates as $injector) {
              $error = $injector->prepare($config, $context);
              if (!$error) {
                  $prepared[] = $injector;
                  continue;
              }
              trigger_error("Cannot enable {$injector->name} injector because $error is not allowed", E_USER_WARNING);
          }
          return $prepared;
      }

      /**
       * Processes arbitrary token values for complicated substitution patterns.
       * In general:
       *
       * If $token is an array, it is a list of tokens to substitute for the
       * current token. These tokens then get individually processed. If there
       * is a leading integer in the list, that integer determines how many
       * tokens from the stream should be removed.
       *
       * If $token is a regular token, it is swapped with the current token.
       *
       * If $token is false, the current token is deleted.
       *
       * If $token is an integer, that number of tokens (with the first token
       * being the current one) will be deleted.
       *
       * @param HTMLPurifier_Token|array|int|bool $token Token substitution value
       * @param int $injector Index of the injector that performed the
       *        substitution; the default of -1 means this is not an
       *        injector related operation.
       * @return mixed Second element of the zipper splice result; callers
       *         assign it as the new current token.
       * @throws HTMLPurifier_Exception
       */
      protected function processToken($token, $injector = -1)
      {
          // normalize forms of token
          if (is_object($token)) {
              $token = array(1, $token);
          }
          if (is_int($token)) {
              $token = array($token);
          }
          if ($token === false) {
              $token = array(1);
          }
          if (!is_array($token)) {
              throw new HTMLPurifier_Exception('Invalid token type from injector');
          }
          // isset() first so that an empty array does not raise an
          // undefined-offset warning; it is treated as "delete one token"
          if (!isset($token[0]) || !is_int($token[0])) {
              array_unshift($token, 1);
          }
          if ($token[0] === 0) {
              throw new HTMLPurifier_Exception('Deleting zero tokens is not valid');
          }

          // $token is now an array with the following form:
          //      array(number nodes to delete, new node 1, new node 2, ...)

          $delete = array_shift($token);
          list($old, $r) = $this->zipper->splice($this->token, $delete, $token);

          if ($injector > -1) {
              // determine appropriate skips: new tokens inherit the skips of
              // the first replaced token and additionally skip this injector
              $oldskip = isset($old[0]) ? $old[0]->skip : array();
              foreach ($token as $object) {
                  $object->skip = $oldskip;
                  $object->skip[$injector] = true;
              }
          }

          return $r;
      }

      /**
       * Inserts a token before the current token. Cursor now points to
       * this token. You must reprocess after this.
       *
       * @param HTMLPurifier_Token $token
       * @return mixed Second element of the zipper splice result; callers
       *         assign it as the new current token.
       */
      private function insertBefore($token)
      {
          // NB not $this->zipper->insertBefore(), due to positioning
          // differences
          $splice = $this->zipper->splice($this->token, 0, array($token));

          return $splice[1];
      }

      /**
       * Removes current token. Cursor now points to new token occupying previously
       * occupied space. You must reprocess after this.
       *
       * @return mixed Whatever the zipper's delete() returns; callers assign
       *         it as the new current token.
       */
      private function remove()
      {
          return $this->zipper->delete();
      }
  }
  541. // vim: et sw=4 sts=4