You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

753 lines
19KB

  1. <?php
  2. /**
  3. * Sequence matcher for Diff
  4. *
  5. * PHP version 5
  6. *
  7. * Copyright (c) 2009 Chris Boulton <chris.boulton@interspire.com>
  8. *
  9. * All rights reserved.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions are met:
  13. *
  14. * - Redistributions of source code must retain the above copyright notice,
  15. * this list of conditions and the following disclaimer.
  16. * - Redistributions in binary form must reproduce the above copyright notice,
  17. * this list of conditions and the following disclaimer in the documentation
  18. * and/or other materials provided with the distribution.
  19. * - Neither the name of the Chris Boulton nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  24. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  27. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  28. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  29. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  30. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  31. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  32. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  33. * POSSIBILITY OF SUCH DAMAGE.
  34. *
  35. * @package Diff
  36. * @author Chris Boulton <chris.boulton@interspire.com>
  37. * @copyright (c) 2009 Chris Boulton
  38. * @license New BSD License http://www.opensource.org/licenses/bsd-license.php
  39. * @version 1.1
  40. * @link http://github.com/chrisboulton/php-diff
  41. */
  class Diff_SequenceMatcher
  {
      /**
       * @var string|array|null Callback (function name or array callable) used to decide whether an element of the second sequence is "junk". Null when no callback is set.
       */
      private $junkCallback = null;

      /**
       * @var array|null The first sequence to compare against.
       */
      private $a = null;

      /**
       * @var array|null The second sequence.
       */
      private $b = null;

      /**
       * @var array Elements of the second sequence that the junk callback flagged as junk. The element is the array key.
       */
      private $junkDict = array();

      /**
       * @var array Map of each element of the second sequence (that is neither junk nor "popular") to the list of indices at which it occurs.
       */
      private $b2j = array();

      /**
       * @var array The effective options: $defaultOptions overlaid with the options supplied by the caller.
       */
      private $options = array();

      /**
       * @var array|null Cached result of getMatchingBlocks(), or null when it has to be recalculated.
       */
      private $matchingBlocks = null;

      /**
       * @var array|null Cached result of getOpCodes(), or null when it has to be recalculated.
       */
      private $opCodes = null;

      /**
       * @var array|null Cached occurrence count of every element of the second sequence, built lazily by quickRatio().
       */
      private $fullBCount = null;

      /**
       * @var array Default values for the supported options. Only 'ignoreWhitespace' and 'ignoreCase' are read by this class.
       */
      private $defaultOptions = array(
          'ignoreNewLines' => false,
          'ignoreWhitespace' => false,
          'ignoreCase' => false
      );

      /**
       * The constructor. With the sequences being passed, they'll be set for the
       * sequence matcher and it will perform a basic cleanup & calculate junk
       * elements.
       *
       * @param string|array $a A string or array containing the lines to compare against.
       * @param string|array $b A string or array containing the lines to compare.
       * @param string|array|null $junkCallback Either an array or string that references a callback function (if there is one) to determine 'junk' characters.
       * @param array $options Options overriding the defaults ('ignoreWhitespace', 'ignoreCase', ...). Optional.
       */
      public function __construct($a, $b, $junkCallback = null, $options = array())
      {
          $this->a = null;
          $this->b = null;
          $this->junkCallback = $junkCallback;
          $this->setOptions($options);
          $this->setSequences($a, $b);
      }

      /**
       * Set new options. The given options are merged over the defaults.
       *
       * @param array $options
       */
      public function setOptions($options)
      {
          $this->options = array_merge($this->defaultOptions, $options);

          // linesAreDifferent() reads the options, so anything calculated with
          // the previous options is stale and has to be recalculated.
          $this->matchingBlocks = null;
          $this->opCodes = null;
      }

      /**
       * Set the first and second sequences to use with the sequence matcher.
       *
       * @param string|array $a A string or array containing the lines to compare against.
       * @param string|array $b A string or array containing the lines to compare.
       */
      public function setSequences($a, $b)
      {
          $this->setSeq1($a);
          $this->setSeq2($b);
      }

      /**
       * Set the first sequence ($a) and reset any internal caches to indicate that
       * when calling the calculation methods, we need to recalculate them.
       *
       * @param string|array $a The sequence to set as the first sequence. A string is split in to single characters.
       */
      public function setSeq1($a)
      {
          if (!is_array($a)) {
              $a = str_split($a);
          }

          // Strict comparison: with "==" an empty array equals the initial null
          // (so it would never be stored) and array('1') equals array('01').
          if ($a === $this->a) {
              return;
          }

          $this->a = $a;
          $this->matchingBlocks = null;
          $this->opCodes = null;
      }

      /**
       * Set the second sequence ($b) and reset any internal caches to indicate that
       * when calling the calculation methods, we need to recalculate them.
       *
       * @param string|array $b The sequence to set as the second sequence. A string is split in to single characters.
       */
      public function setSeq2($b)
      {
          if (!is_array($b)) {
              $b = str_split($b);
          }

          // Strict comparison for the same reasons as in setSeq1().
          if ($b === $this->b) {
              return;
          }

          $this->b = $b;
          $this->matchingBlocks = null;
          $this->opCodes = null;
          $this->fullBCount = null;
          $this->chainB();
      }

      /**
       * Generate the internal arrays containing the list of junk and non-junk
       * characters for the second ($b) sequence.
       */
      private function chainB()
      {
          $length = count($this->b);
          $this->b2j = array();
          $popularDict = array();

          for ($i = 0; $i < $length; ++$i) {
              $char = $this->b[$i];
              if (!isset($this->b2j[$char])) {
                  $this->b2j[$char] = array($i);
                  continue;
              }

              // In sequences of 200+ elements, an element that makes up more
              // than 1% of the sequence is "popular" and is dropped from the index.
              if ($length >= 200 && count($this->b2j[$char]) * 100 > $length) {
                  $popularDict[$char] = 1;
                  unset($this->b2j[$char]);
              } else {
                  $this->b2j[$char][] = $i;
              }
          }

          // Remove leftovers: occurrences seen after an element was marked as
          // popular created a fresh index entry for it.
          foreach (array_keys($popularDict) as $char) {
              unset($this->b2j[$char]);
          }

          $this->junkDict = array();
          if (is_callable($this->junkCallback)) {
              foreach (array_keys($popularDict) as $char) {
                  if (call_user_func($this->junkCallback, $char)) {
                      $this->junkDict[$char] = 1;
                      unset($popularDict[$char]);
                  }
              }

              foreach (array_keys($this->b2j) as $char) {
                  if (call_user_func($this->junkCallback, $char)) {
                      $this->junkDict[$char] = 1;
                      unset($this->b2j[$char]);
                  }
              }
          }
      }

      /**
       * Checks if a particular character is in the junk dictionary
       * for the list of junk characters.
       *
       * @param mixed $b The element of the second sequence to look up.
       * @return boolean True if the character is considered junk. False if not.
       */
      private function isBJunk($b)
      {
          return isset($this->junkDict[$b]);
      }

      /**
       * Find the longest matching block in the two sequences, as defined by the
       * lower and upper constraints for each sequence. (for the first sequence,
       * $alo - $ahi and for the second sequence, $blo - $bhi)
       *
       * Essentially, of all of the maximal matching blocks, return the one that
       * starts earliest in $a, and of all of those maximal matching blocks that
       * start earliest in $a, return the one that starts earliest in $b.
       *
       * If the junk callback is defined, do the above but with the restriction
       * that no junk element appears in the block. Then extend it as far as
       * possible by matching only junk elements in both $a and $b.
       *
       * @param int $alo The lower constraint for the first sequence.
       * @param int $ahi The upper constraint for the first sequence.
       * @param int $blo The lower constraint for the second sequence.
       * @param int $bhi The upper constraint for the second sequence.
       * @return array Array containing the longest match that includes the starting position in $a, start in $b and the length/size.
       */
      public function findLongestMatch($alo, $ahi, $blo, $bhi)
      {
          $a = $this->a;
          $b = $this->b;

          $bestI = $alo;
          $bestJ = $blo;
          $bestSize = 0;

          // $j2Len[$j] is the length of the longest match that ends with
          // $a[$i - 1] and $b[$j].
          $j2Len = array();
          $nothing = array();

          for ($i = $alo; $i < $ahi; ++$i) {
              $newJ2Len = array();
              $jDict = $this->arrayGetDefault($this->b2j, $a[$i], $nothing);
              foreach ($jDict as $j) {
                  if ($j < $blo) {
                      continue;
                  }
                  if ($j >= $bhi) {
                      break;
                  }

                  $k = $this->arrayGetDefault($j2Len, $j - 1, 0) + 1;
                  $newJ2Len[$j] = $k;
                  if ($k > $bestSize) {
                      $bestI = $i - $k + 1;
                      $bestJ = $j - $k + 1;
                      $bestSize = $k;
                  }
              }

              $j2Len = $newJ2Len;
          }

          // Extend the match backwards and then forwards with non-junk elements
          // that linesAreDifferent() regards as equal.
          while ($bestI > $alo && $bestJ > $blo && !$this->isBJunk($b[$bestJ - 1]) &&
              !$this->linesAreDifferent($bestI - 1, $bestJ - 1)) {
              --$bestI;
              --$bestJ;
              ++$bestSize;
          }

          while ($bestI + $bestSize < $ahi && ($bestJ + $bestSize) < $bhi &&
              !$this->isBJunk($b[$bestJ + $bestSize]) &&
              !$this->linesAreDifferent($bestI + $bestSize, $bestJ + $bestSize)) {
              ++$bestSize;
          }

          // Now extend the match backwards and forwards with matching junk
          // elements on either side.
          while ($bestI > $alo && $bestJ > $blo && $this->isBJunk($b[$bestJ - 1]) &&
              !$this->linesAreDifferent($bestI - 1, $bestJ - 1)) {
              --$bestI;
              --$bestJ;
              ++$bestSize;
          }

          while ($bestI + $bestSize < $ahi && $bestJ + $bestSize < $bhi &&
              $this->isBJunk($b[$bestJ + $bestSize]) &&
              !$this->linesAreDifferent($bestI + $bestSize, $bestJ + $bestSize)) {
              ++$bestSize;
          }

          return array(
              $bestI,
              $bestJ,
              $bestSize
          );
      }

      /**
       * Check if the two lines at the given indexes are different or not.
       * Honours the 'ignoreWhitespace' (tabs and spaces are stripped) and
       * 'ignoreCase' options.
       *
       * @param int $aIndex Line number to check against in a.
       * @param int $bIndex Line number to check against in b.
       * @return boolean True if the lines are different and false if not.
       */
      public function linesAreDifferent($aIndex, $bIndex)
      {
          // Compare as strings: a loose comparison would treat numeric-looking
          // lines such as '1e3' and '1000' as equal.
          $lineA = (string)$this->a[$aIndex];
          $lineB = (string)$this->b[$bIndex];

          if ($this->options['ignoreWhitespace']) {
              $replace = array("\t", ' ');
              $lineA = str_replace($replace, '', $lineA);
              $lineB = str_replace($replace, '', $lineB);
          }

          if ($this->options['ignoreCase']) {
              $lineA = strtolower($lineA);
              $lineB = strtolower($lineB);
          }

          return $lineA !== $lineB;
      }

      /**
       * Return a nested set of arrays for all of the matching sub-sequences
       * in the strings $a and $b.
       *
       * Each block contains the lower constraint of the block in $a, the lower
       * constraint of the block in $b and finally the number of lines that the
       * block continues for. The last block is always a zero-length sentinel
       * positioned at the end of both sequences.
       *
       * @return array Nested array of the matching blocks, as described by the function.
       */
      public function getMatchingBlocks()
      {
          if ($this->matchingBlocks !== null) {
              return $this->matchingBlocks;
          }

          $aLength = count($this->a);
          $bLength = count($this->b);

          // Ranges that still have to be searched for a longest match.
          $queue = array(
              array(
                  0,
                  $aLength,
                  0,
                  $bLength
              )
          );

          $matchingBlocks = array();
          while (!empty($queue)) {
              list($alo, $ahi, $blo, $bhi) = array_pop($queue);
              $x = $this->findLongestMatch($alo, $ahi, $blo, $bhi);
              list($i, $j, $k) = $x;
              if ($k) {
                  $matchingBlocks[] = $x;
                  // Search what is left before the match ...
                  if ($alo < $i && $blo < $j) {
                      $queue[] = array(
                          $alo,
                          $i,
                          $blo,
                          $j
                      );
                  }

                  // ... and what is left after it.
                  if ($i + $k < $ahi && $j + $k < $bhi) {
                      $queue[] = array(
                          $i + $k,
                          $ahi,
                          $j + $k,
                          $bhi
                      );
                  }
              }
          }

          usort($matchingBlocks, array($this, 'tupleSort'));

          // Collapse blocks that are directly adjacent in both sequences.
          $i1 = 0;
          $j1 = 0;
          $k1 = 0;
          $nonAdjacent = array();
          foreach ($matchingBlocks as $block) {
              list($i2, $j2, $k2) = $block;
              if ($i1 + $k1 == $i2 && $j1 + $k1 == $j2) {
                  $k1 += $k2;
              } else {
                  if ($k1) {
                      $nonAdjacent[] = array(
                          $i1,
                          $j1,
                          $k1
                      );
                  }

                  $i1 = $i2;
                  $j1 = $j2;
                  $k1 = $k2;
              }
          }

          if ($k1) {
              $nonAdjacent[] = array(
                  $i1,
                  $j1,
                  $k1
              );
          }

          // Sentinel block marking the end of both sequences.
          $nonAdjacent[] = array(
              $aLength,
              $bLength,
              0
          );

          $this->matchingBlocks = $nonAdjacent;
          return $this->matchingBlocks;
      }

      /**
       * Return a list of all of the opcodes for the differences between the
       * two strings.
       *
       * The nested array returned contains an array describing the opcode
       * which includes:
       * 0 - The type of tag (as described below) for the opcode.
       * 1 - The beginning line in the first sequence.
       * 2 - The end line in the first sequence.
       * 3 - The beginning line in the second sequence.
       * 4 - The end line in the second sequence.
       *
       * The different types of tags include:
       * replace - The string from $i1 to $i2 in $a should be replaced by
       *           the string in $b from $j1 to $j2.
       * delete -  The string in $a from $i1 to $i2 should be deleted.
       * insert -  The string in $b from $j1 to $j2 should be inserted at
       *           $i1 in $a.
       * equal  -  The two strings with the specified ranges are equal.
       *
       * @return array Array of the opcodes describing the differences between the strings.
       */
      public function getOpCodes()
      {
          // Compare against null so that an empty (but calculated) list of
          // opcodes is served from the cache too.
          if ($this->opCodes !== null) {
              return $this->opCodes;
          }

          $i = 0;
          $j = 0;
          $this->opCodes = array();

          $blocks = $this->getMatchingBlocks();
          foreach ($blocks as $block) {
              list($ai, $bj, $size) = $block;

              // Whatever lies between the previous block and this one differs.
              $tag = '';
              if ($i < $ai && $j < $bj) {
                  $tag = 'replace';
              } else if ($i < $ai) {
                  $tag = 'delete';
              } else if ($j < $bj) {
                  $tag = 'insert';
              }

              if ($tag) {
                  $this->opCodes[] = array(
                      $tag,
                      $i,
                      $ai,
                      $j,
                      $bj
                  );
              }

              $i = $ai + $size;
              $j = $bj + $size;

              // The sentinel block has a size of 0 and produces no 'equal' opcode.
              if ($size) {
                  $this->opCodes[] = array(
                      'equal',
                      $ai,
                      $i,
                      $bj,
                      $j
                  );
              }
          }

          return $this->opCodes;
      }

      /**
       * Return a series of nested arrays containing different groups of generated
       * opcodes for the differences between the strings with up to $context lines
       * of surrounding content.
       *
       * Essentially what happens here is any big equal blocks of strings are stripped
       * out, the smaller subsets of changes are then arranged in to their groups.
       * This means that the sequence matcher and diffs do not need to include the full
       * content of the different files but can still provide context as to where the
       * changes are.
       *
       * @param int $context The number of lines of context to provide around the groups.
       * @return array Nested array of all of the grouped opcodes.
       */
      public function getGroupedOpcodes($context = 3)
      {
          $opCodes = $this->getOpCodes();
          if (empty($opCodes)) {
              $opCodes = array(
                  array(
                      'equal',
                      0,
                      1,
                      0,
                      1
                  )
              );
          }

          // Trim a leading 'equal' opcode down to the last $context lines.
          if ($opCodes[0][0] == 'equal') {
              $opCodes[0] = array(
                  $opCodes[0][0],
                  max($opCodes[0][1], $opCodes[0][2] - $context),
                  $opCodes[0][2],
                  max($opCodes[0][3], $opCodes[0][4] - $context),
                  $opCodes[0][4]
              );
          }

          // Trim a trailing 'equal' opcode down to the first $context lines.
          $lastItem = count($opCodes) - 1;
          if ($opCodes[$lastItem][0] == 'equal') {
              list($tag, $i1, $i2, $j1, $j2) = $opCodes[$lastItem];
              $opCodes[$lastItem] = array(
                  $tag,
                  $i1,
                  min($i2, $i1 + $context),
                  $j1,
                  min($j2, $j1 + $context)
              );
          }

          $maxRange = $context * 2;
          $groups = array();
          $group = array();
          foreach ($opCodes as $code) {
              list($tag, $i1, $i2, $j1, $j2) = $code;

              // An 'equal' range longer than twice the context ends the current
              // group and starts a new one.
              if ($tag == 'equal' && $i2 - $i1 > $maxRange) {
                  $group[] = array(
                      $tag,
                      $i1,
                      min($i2, $i1 + $context),
                      $j1,
                      min($j2, $j1 + $context)
                  );
                  $groups[] = $group;
                  $group = array();
                  $i1 = max($i1, $i2 - $context);
                  $j1 = max($j1, $j2 - $context);
              }

              $group[] = array(
                  $tag,
                  $i1,
                  $i2,
                  $j1,
                  $j2
              );
          }

          // A final group consisting of a single 'equal' opcode carries no change.
          if (!empty($group) && !(count($group) == 1 && $group[0][0] == 'equal')) {
              $groups[] = $group;
          }

          return $groups;
      }

      /**
       * Return a measure of the similarity between the two sequences.
       * This will be a float value between 0 and 1.
       *
       * Out of all of the ratio calculation functions, this is the most
       * expensive to call if getMatchingBlocks or getOpCodes is yet to be
       * called. quickRatio and realquickRatio perform quicker calculations
       * but only return an upper bound.
       *
       * The ratio is calculated as (2 * number of matches) / total number of
       * elements in both sequences.
       *
       * @return float The calculated ratio.
       */
      public function Ratio()
      {
          $matches = array_reduce($this->getMatchingBlocks(), array($this, 'ratioReduce'), 0);
          return $this->calculateRatio($matches, count($this->a) + count($this->b));
      }

      /**
       * Helper function to calculate the number of matches for Ratio().
       *
       * @param int $sum The running total for the number of matches.
       * @param array $triple Array containing the matching block triple to add to the running total.
       * @return int The new running total for the number of matches.
       */
      private function ratioReduce($sum, $triple)
      {
          // The size of a block is its last element.
          return $sum + ($triple[count($triple) - 1]);
      }

      /**
       * Quickly return an upper bound ratio for the similarity of the strings.
       * This is quicker to compute than Ratio().
       *
       * @return float The calculated ratio.
       */
      private function quickRatio()
      {
          if ($this->fullBCount === null) {
              $this->fullBCount = array();
              $bLength = count($this->b);
              for ($i = 0; $i < $bLength; ++$i) {
                  $char = $this->b[$i];
                  $this->fullBCount[$char] = $this->arrayGetDefault($this->fullBCount, $char, 0) + 1;
              }
          }

          // $avail[$char] is the number of occurrences of $char in $b that have
          // not been paired with an occurrence in $a yet.
          $avail = array();
          $matches = 0;
          $aLength = count($this->a);
          for ($i = 0; $i < $aLength; ++$i) {
              $char = $this->a[$i];
              if (isset($avail[$char])) {
                  $numb = $avail[$char];
              } else {
                  $numb = $this->arrayGetDefault($this->fullBCount, $char, 0);
              }

              $avail[$char] = $numb - 1;
              if ($numb > 0) {
                  ++$matches;
              }
          }

          return $this->calculateRatio($matches, count($this->a) + count($this->b));
      }

      /**
       * Return an upper bound ratio really quickly for the similarity of the strings.
       * This is quicker to compute than Ratio() and quickRatio().
       *
       * @return float The calculated ratio.
       */
      private function realquickRatio()
      {
          $aLength = count($this->a);
          $bLength = count($this->b);

          return $this->calculateRatio(min($aLength, $bLength), $aLength + $bLength);
      }

      /**
       * Helper function for calculating the ratio to measure similarity for the strings.
       * The ratio is defined as being 2 * (number of matches / total length)
       *
       * @param int $matches The number of matches in the two strings.
       * @param int $length The combined length of the two strings.
       * @return float|int The calculated ratio, or 1 when the combined length is 0.
       */
      private function calculateRatio($matches, $length = 0)
      {
          if ($length) {
              return 2 * ($matches / $length);
          }

          return 1;
      }

      /**
       * Helper function that provides the ability to return the value for a key
       * in an array if it exists, or if it doesn't then return a default value.
       * Essentially cleaner than doing a series of if(isset()) {} else {} calls.
       *
       * @param array $array The array to search.
       * @param string $key The key to check that exists.
       * @param mixed $default The value to return as the default value if the key doesn't exist (or holds null).
       * @return mixed The value from the array if the key exists or otherwise the default.
       */
      private function arrayGetDefault($array, $key, $default)
      {
          if (isset($array[$key])) {
              return $array[$key];
          }

          return $default;
      }

      /**
       * Sort an array by the nested arrays it contains. Helper function for getMatchingBlocks
       *
       * The arrays are compared element by element. If one array is a prefix of
       * the other, the shorter one sorts first.
       *
       * @param array $a First array to compare.
       * @param array $b Second array to compare.
       * @return int -1, 0 or 1, as expected by the usort function.
       */
      private function tupleSort($a, $b)
      {
          $aCount = count($a);
          $bCount = count($b);

          // Only the positions present in both arrays can be compared.
          $shared = min($aCount, $bCount);
          for ($i = 0; $i < $shared; ++$i) {
              if ($a[$i] < $b[$i]) {
                  return -1;
              } else if ($a[$i] > $b[$i]) {
                  return 1;
              }
          }

          if ($aCount == $bCount) {
              return 0;
          }

          return $aCount < $bCount ? -1 : 1;
      }
  }