You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

EntityParser.php 4.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. <?php
  2. // If we want to implement error collecting here, we'll need to use some sort
  3. // of global data (probably trigger_error) because it's impossible to pass
  4. // $config or $context to the callback functions.
  5. /**
  6. * Handles referencing and dereferencing character entities
  7. */
  class HTMLPurifier_EntityParser
  {
      /**
       * Reference to entity lookup table. Populated lazily the first time a
       * non-special named entity has to be resolved.
       * @type HTMLPurifier_EntityLookup
       */
      protected $_entity_lookup;

      /**
       * Callback regex string for parsing entities.
       *
       * Capture groups:
       *   1. hex digits of a "&#x...;" reference
       *   2. decimal digits of a "&#...;" reference (leading zeros are
       *      consumed outside the group where possible)
       *   3. entity name (XML style)
       * The trailing semicolon is optional.
       * @type string
       */
      protected $_substituteEntitiesRegex =
          '/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z_:][A-Za-z0-9.\-_:]*));?/';

      /**
       * Decimal to parsed string conversion table for special entities.
       * @type array
       */
      protected $_special_dec2str =
          array(
              34 => '"',
              38 => '&',
              39 => "'",
              60 => '<',
              62 => '>'
          );

      /**
       * Stripped entity names to decimal conversion table for special entities.
       * @type array
       */
      protected $_special_ent2dec =
          array(
              'quot' => 34,
              'amp' => 38,
              'lt' => 60,
              'gt' => 62
          );

      /**
       * Substitutes non-special entities with their parsed equivalents.
       * Special entities (those listed in $_special_dec2str and
       * $_special_ent2dec) are left untouched so that markup-significant
       * characters stay escaped.
       *
       * @param string $string String to have non-special entities parsed.
       * @return string Parsed string.
       */
      public function substituteNonSpecialEntities($string)
      {
          // it will try to detect missing semicolons, but don't rely on it
          return preg_replace_callback(
              $this->_substituteEntitiesRegex,
              array($this, 'nonSpecialEntityCallback'),
              $string
          );
      }

      /**
       * Callback function for substituteNonSpecialEntities() that does the work.
       *
       * @param array $matches PCRE matches array, with 0 the entire match, and
       *                       either index 1, 2 or 3 set with a hex value, dec value,
       *                       or string (respectively).
       * @return string Replacement string: the original entity text when it
       *                is special or unknown, otherwise its substitution.
       */
      protected function nonSpecialEntityCallback($matches)
      {
          $entity = $matches[0];
          $code = $this->numericEntityCode($matches);

          if ($code !== null) {
              // numeric reference: leave the special characters encoded
              if (isset($this->_special_dec2str[$code])) {
                  return $entity;
              }
              return HTMLPurifier_Encoder::unichr($code);
          }

          // named reference
          $name = $matches[3];
          if (isset($this->_special_ent2dec[$name])) {
              return $entity;
          }
          if (!$this->_entity_lookup) {
              $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
          }
          // unknown names are passed through unchanged
          return isset($this->_entity_lookup->table[$name]) ?
              $this->_entity_lookup->table[$name] :
              $entity;
      }

      /**
       * Substitutes only special entities with their parsed equivalents.
       *
       * @notice We try to avoid calling this function because otherwise, it
       *      would have to be called a lot (for every parsed section).
       *
       * @param string $string String to have special entities parsed.
       * @return string Parsed string.
       */
      public function substituteSpecialEntities($string)
      {
          return preg_replace_callback(
              $this->_substituteEntitiesRegex,
              array($this, 'specialEntityCallback'),
              $string
          );
      }

      /**
       * Callback function for substituteSpecialEntities() that does the work.
       *
       * This callback has same syntax as nonSpecialEntityCallback().
       *
       * @param array $matches PCRE-style matches array, with 0 the entire match, and
       *                       either index 1, 2 or 3 set with a hex value, dec value,
       *                       or string (respectively).
       * @return string Replacement string: the literal character for a
       *                special entity, otherwise the original entity text.
       */
      protected function specialEntityCallback($matches)
      {
          $entity = $matches[0];
          $code = $this->numericEntityCode($matches);

          if ($code !== null) {
              return isset($this->_special_dec2str[$code]) ?
                  $this->_special_dec2str[$code] :
                  $entity;
          }

          $name = $matches[3];
          if (!isset($this->_special_ent2dec[$name])) {
              return $entity;
          }
          // Map name -> code point -> character. Returning the code point
          // itself would turn "&amp;" into "38" instead of "&".
          return $this->_special_dec2str[$this->_special_ent2dec[$name]];
      }

      /**
       * Extracts the code point from a numeric character reference match.
       *
       * @param array $matches PCRE matches array produced by
       *                       $_substituteEntitiesRegex.
       * @return int|float|null Value of the hex (group 1) or decimal
       *                        (group 2) reference, or null when the match is
       *                        a named entity. hexdec() may return a float
       *                        for very large inputs.
       */
      private function numericEntityCode($matches)
      {
          $entity = $matches[0];
          // a numeric reference always starts with "&#"
          if (!isset($entity[1]) || $entity[1] !== '#') {
              return null;
          }
          if (isset($entity[2]) && $entity[2] === 'x') {
              return hexdec($matches[1]);
          }
          return (int) $matches[2];
      }
  }
  141. // vim: et sw=4 sts=4