You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

748 lines
19KB

  1. <?php
  2. /**
  3. * Sequence matcher for Diff
  4. *
  5. * PHP version 5
  6. *
  7. * Copyright (c) 2009 Chris Boulton <chris.boulton@interspire.com>
  8. *
  9. * All rights reserved.
  10. *
  11. * Redistribution and use in source and binary forms, with or without
  12. * modification, are permitted provided that the following conditions are met:
  13. *
  14. * - Redistributions of source code must retain the above copyright notice,
  15. * this list of conditions and the following disclaimer.
  16. * - Redistributions in binary form must reproduce the above copyright notice,
  17. * this list of conditions and the following disclaimer in the documentation
  18. * and/or other materials provided with the distribution.
  19. * - Neither the name of the Chris Boulton nor the names of its contributors
  20. * may be used to endorse or promote products derived from this software
  21. * without specific prior written permission.
  22. *
  23. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  24. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  25. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  26. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  27. * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  28. * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  29. * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  30. * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  31. * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  32. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  33. * POSSIBILITY OF SUCH DAMAGE.
  34. *
  35. * @package Diff
  36. * @author Chris Boulton <chris.boulton@interspire.com>
  37. * @copyright (c) 2009 Chris Boulton
  38. * @license New BSD License http://www.opensource.org/licenses/bsd-license.php
  39. * @version 1.1
  40. * @link http://github.com/chrisboulton/php-diff
  41. */
  42. class Diff_SequenceMatcher
  43. {
  44. /**
  45. * @var string|array Either a string or an array containing a callback function to determine if a line is "junk" or not.
  46. */
  47. private $junkCallback = null;
  48. /**
  49. * @var array The first sequence to compare against.
  50. */
  51. private $a = null;
  52. /**
  53. * @var array The second sequence.
  54. */
  55. private $b = null;
  56. /**
  57. * @var array Array of characters that are considered junk from the second sequence. Characters are the array key.
  58. */
  59. private $junkDict = array();
  60. /**
  61. * @var array Array of indices that do not contain junk elements.
  62. */
  63. private $b2j = array();
  64. private $options = array();
  65. private $defaultOptions = array(
  66. 'ignoreNewLines' => false,
  67. 'ignoreWhitespace' => false,
  68. 'ignoreCase' => false
  69. );
  70. /**
  71. * The constructor. With the sequences being passed, they'll be set for the
  72. * sequence matcher and it will perform a basic cleanup & calculate junk
  73. * elements.
  74. *
  75. * @param string|array $a A string or array containing the lines to compare against.
  76. * @param string|array $b A string or array containing the lines to compare.
  77. * @param string|array $junkCallback Either an array or string that references a callback function (if there is one) to determine 'junk' characters.
  78. * @param array $options
  79. */
  80. public function __construct($a, $b, $junkCallback=null, $options)
  81. {
  82. $this->a = null;
  83. $this->b = null;
  84. $this->junkCallback = $junkCallback;
  85. $this->setOptions($options);
  86. $this->setSequences($a, $b);
  87. }
  88. /**
  89. * Set new options
  90. *
  91. * @param array $options
  92. */
  93. public function setOptions($options)
  94. {
  95. $this->options = array_merge($this->defaultOptions, $options);
  96. }
  97. /**
  98. * Set the first and second sequences to use with the sequence matcher.
  99. *
  100. * @param string|array $a A string or array containing the lines to compare against.
  101. * @param string|array $b A string or array containing the lines to compare.
  102. */
  103. public function setSequences($a, $b)
  104. {
  105. $this->setSeq1($a);
  106. $this->setSeq2($b);
  107. }
  108. /**
  109. * Set the first sequence ($a) and reset any internal caches to indicate that
  110. * when calling the calculation methods, we need to recalculate them.
  111. *
  112. * @param string|array $a The sequence to set as the first sequence.
  113. */
  114. public function setSeq1($a)
  115. {
  116. if(!is_array($a)) {
  117. $a = str_split($a);
  118. }
  119. if($a == $this->a) {
  120. return;
  121. }
  122. $this->a= $a;
  123. $this->matchingBlocks = null;
  124. $this->opCodes = null;
  125. }
  126. /**
  127. * Set the second sequence ($b) and reset any internal caches to indicate that
  128. * when calling the calculation methods, we need to recalculate them.
  129. *
  130. * @param string|array $b The sequence to set as the second sequence.
  131. */
  132. public function setSeq2($b)
  133. {
  134. if(!is_array($b)) {
  135. $b = str_split($b);
  136. }
  137. if($b == $this->b) {
  138. return;
  139. }
  140. $this->b = $b;
  141. $this->matchingBlocks = null;
  142. $this->opCodes = null;
  143. $this->fullBCount = null;
  144. $this->chainB();
  145. }
  146. /**
  147. * Generate the internal arrays containing the list of junk and non-junk
  148. * characters for the second ($b) sequence.
  149. */
  150. private function chainB()
  151. {
  152. $length = count ($this->b);
  153. $this->b2j = array();
  154. $popularDict = array();
  155. for($i = 0; $i < $length; ++$i) {
  156. $char = $this->b[$i];
  157. if(isset($this->b2j[$char])) {
  158. if($length >= 200 && count($this->b2j[$char]) * 100 > $length) {
  159. $popularDict[$char] = 1;
  160. unset($this->b2j[$char]);
  161. }
  162. else {
  163. $this->b2j[$char][] = $i;
  164. }
  165. }
  166. else {
  167. $this->b2j[$char] = array(
  168. $i
  169. );
  170. }
  171. }
  172. // Remove leftovers
  173. foreach(array_keys($popularDict) as $char) {
  174. unset($this->b2j[$char]);
  175. }
  176. $this->junkDict = array();
  177. if(is_callable($this->junkCallback)) {
  178. foreach(array_keys($popularDict) as $char) {
  179. if(call_user_func($this->junkCallback, $char)) {
  180. $this->junkDict[$char] = 1;
  181. unset($popularDict[$char]);
  182. }
  183. }
  184. foreach(array_keys($this->b2j) as $char) {
  185. if(call_user_func($this->junkCallback, $char)) {
  186. $this->junkDict[$char] = 1;
  187. unset($this->b2j[$char]);
  188. }
  189. }
  190. }
  191. }
  192. /**
  193. * Checks if a particular character is in the junk dictionary
  194. * for the list of junk characters.
  195. * @param $b
  196. * @return boolean True if the character is considered junk. False if not.
  197. */
  198. private function isBJunk($b)
  199. {
  200. if(isset($this->juncDict[$b])) {
  201. return true;
  202. }
  203. return false;
  204. }
  205. /**
  206. * Find the longest matching block in the two sequences, as defined by the
  207. * lower and upper constraints for each sequence. (for the first sequence,
  208. * $alo - $ahi and for the second sequence, $blo - $bhi)
  209. *
  210. * Essentially, of all of the maximal matching blocks, return the one that
  211. * startest earliest in $a, and all of those maximal matching blocks that
  212. * start earliest in $a, return the one that starts earliest in $b.
  213. *
  214. * If the junk callback is defined, do the above but with the restriction
  215. * that the junk element appears in the block. Extend it as far as possible
  216. * by matching only junk elements in both $a and $b.
  217. *
  218. * @param int $alo The lower constraint for the first sequence.
  219. * @param int $ahi The upper constraint for the first sequence.
  220. * @param int $blo The lower constraint for the second sequence.
  221. * @param int $bhi The upper constraint for the second sequence.
  222. * @return array Array containing the longest match that includes the starting position in $a, start in $b and the length/size.
  223. */
  224. public function findLongestMatch($alo, $ahi, $blo, $bhi)
  225. {
  226. $a = $this->a;
  227. $b = $this->b;
  228. $bestI = $alo;
  229. $bestJ = $blo;
  230. $bestSize = 0;
  231. $j2Len = array();
  232. $nothing = array();
  233. for($i = $alo; $i < $ahi; ++$i) {
  234. $newJ2Len = array();
  235. $jDict = $this->arrayGetDefault($this->b2j, $a[$i], $nothing);
  236. foreach($jDict as $jKey => $j) {
  237. if($j < $blo) {
  238. continue;
  239. }
  240. else if($j >= $bhi) {
  241. break;
  242. }
  243. $k = $this->arrayGetDefault($j2Len, $j -1, 0) + 1;
  244. $newJ2Len[$j] = $k;
  245. if($k > $bestSize) {
  246. $bestI = $i - $k + 1;
  247. $bestJ = $j - $k + 1;
  248. $bestSize = $k;
  249. }
  250. }
  251. $j2Len = $newJ2Len;
  252. }
  253. while($bestI > $alo && $bestJ > $blo && !$this->isBJunk($b[$bestJ - 1]) &&
  254. !$this->linesAreDifferent($bestI - 1, $bestJ - 1)) {
  255. --$bestI;
  256. --$bestJ;
  257. ++$bestSize;
  258. }
  259. while($bestI + $bestSize < $ahi && ($bestJ + $bestSize) < $bhi &&
  260. !$this->isBJunk($b[$bestJ + $bestSize]) && !$this->linesAreDifferent($bestI + $bestSize, $bestJ + $bestSize)) {
  261. ++$bestSize;
  262. }
  263. while($bestI > $alo && $bestJ > $blo && $this->isBJunk($b[$bestJ - 1]) &&
  264. !$this->isLineDifferent($bestI - 1, $bestJ - 1)) {
  265. --$bestI;
  266. --$bestJ;
  267. ++$bestSize;
  268. }
  269. while($bestI + $bestSize < $ahi && $bestJ + $bestSize < $bhi &&
  270. $this->isBJunk($b[$bestJ + $bestSize]) && !$this->linesAreDifferent($bestI + $bestSize, $bestJ + $bestSize)) {
  271. ++$bestSize;
  272. }
  273. return array(
  274. $bestI,
  275. $bestJ,
  276. $bestSize
  277. );
  278. }
  279. /**
  280. * Check if the two lines at the given indexes are different or not.
  281. *
  282. * @param int $aIndex Line number to check against in a.
  283. * @param int $bIndex Line number to check against in b.
  284. * @return boolean True if the lines are different and false if not.
  285. */
  286. public function linesAreDifferent($aIndex, $bIndex)
  287. {
  288. $lineA = $this->a[$aIndex];
  289. $lineB = $this->b[$bIndex];
  290. if($this->options['ignoreWhitespace']) {
  291. $replace = array("\t", ' ');
  292. $lineA = str_replace($replace, '', $lineA);
  293. $lineB = str_replace($replace, '', $lineB);
  294. }
  295. if($this->options['ignoreCase']) {
  296. $lineA = strtolower($lineA);
  297. $lineB = strtolower($lineB);
  298. }
  299. if($lineA != $lineB) {
  300. return true;
  301. }
  302. return false;
  303. }
  304. /**
  305. * Return a nested set of arrays for all of the matching sub-sequences
  306. * in the strings $a and $b.
  307. *
  308. * Each block contains the lower constraint of the block in $a, the lower
  309. * constraint of the block in $b and finally the number of lines that the
  310. * block continues for.
  311. *
  312. * @return array Nested array of the matching blocks, as described by the function.
  313. */
  314. public function getMatchingBlocks()
  315. {
  316. if(!empty($this->matchingBlocks)) {
  317. return $this->matchingBlocks;
  318. }
  319. $aLength = count($this->a);
  320. $bLength = count($this->b);
  321. $queue = array(
  322. array(
  323. 0,
  324. $aLength,
  325. 0,
  326. $bLength
  327. )
  328. );
  329. $matchingBlocks = array();
  330. while(!empty($queue)) {
  331. list($alo, $ahi, $blo, $bhi) = array_pop($queue);
  332. $x = $this->findLongestMatch($alo, $ahi, $blo, $bhi);
  333. list($i, $j, $k) = $x;
  334. if($k) {
  335. $matchingBlocks[] = $x;
  336. if($alo < $i && $blo < $j) {
  337. $queue[] = array(
  338. $alo,
  339. $i,
  340. $blo,
  341. $j
  342. );
  343. }
  344. if($i + $k < $ahi && $j + $k < $bhi) {
  345. $queue[] = array(
  346. $i + $k,
  347. $ahi,
  348. $j + $k,
  349. $bhi
  350. );
  351. }
  352. }
  353. }
  354. usort($matchingBlocks, array($this, 'tupleSort'));
  355. $i1 = 0;
  356. $j1 = 0;
  357. $k1 = 0;
  358. $nonAdjacent = array();
  359. foreach($matchingBlocks as $block) {
  360. list($i2, $j2, $k2) = $block;
  361. if($i1 + $k1 == $i2 && $j1 + $k1 == $j2) {
  362. $k1 += $k2;
  363. }
  364. else {
  365. if($k1) {
  366. $nonAdjacent[] = array(
  367. $i1,
  368. $j1,
  369. $k1
  370. );
  371. }
  372. $i1 = $i2;
  373. $j1 = $j2;
  374. $k1 = $k2;
  375. }
  376. }
  377. if($k1) {
  378. $nonAdjacent[] = array(
  379. $i1,
  380. $j1,
  381. $k1
  382. );
  383. }
  384. $nonAdjacent[] = array(
  385. $aLength,
  386. $bLength,
  387. 0
  388. );
  389. $this->matchingBlocks = $nonAdjacent;
  390. return $this->matchingBlocks;
  391. }
  392. /**
  393. * Return a list of all of the opcodes for the differences between the
  394. * two strings.
  395. *
  396. * The nested array returned contains an array describing the opcode
  397. * which includes:
  398. * 0 - The type of tag (as described below) for the opcode.
  399. * 1 - The beginning line in the first sequence.
  400. * 2 - The end line in the first sequence.
  401. * 3 - The beginning line in the second sequence.
  402. * 4 - The end line in the second sequence.
  403. *
  404. * The different types of tags include:
  405. * replace - The string from $i1 to $i2 in $a should be replaced by
  406. * the string in $b from $j1 to $j2.
  407. * delete - The string in $a from $i1 to $j2 should be deleted.
  408. * insert - The string in $b from $j1 to $j2 should be inserted at
  409. * $i1 in $a.
  410. * equal - The two strings with the specified ranges are equal.
  411. *
  412. * @return array Array of the opcodes describing the differences between the strings.
  413. */
  414. public function getOpCodes()
  415. {
  416. if(!empty($this->opCodes)) {
  417. return $this->opCodes;
  418. }
  419. $i = 0;
  420. $j = 0;
  421. $this->opCodes = array();
  422. $blocks = $this->getMatchingBlocks();
  423. foreach($blocks as $block) {
  424. list($ai, $bj, $size) = $block;
  425. $tag = '';
  426. if($i < $ai && $j < $bj) {
  427. $tag = 'replace';
  428. }
  429. else if($i < $ai) {
  430. $tag = 'delete';
  431. }
  432. else if($j < $bj) {
  433. $tag = 'insert';
  434. }
  435. if($tag) {
  436. $this->opCodes[] = array(
  437. $tag,
  438. $i,
  439. $ai,
  440. $j,
  441. $bj
  442. );
  443. }
  444. $i = $ai + $size;
  445. $j = $bj + $size;
  446. if($size) {
  447. $this->opCodes[] = array(
  448. 'equal',
  449. $ai,
  450. $i,
  451. $bj,
  452. $j
  453. );
  454. }
  455. }
  456. return $this->opCodes;
  457. }
  458. /**
  459. * Return a series of nested arrays containing different groups of generated
  460. * opcodes for the differences between the strings with up to $context lines
  461. * of surrounding content.
  462. *
  463. * Essentially what happens here is any big equal blocks of strings are stripped
  464. * out, the smaller subsets of changes are then arranged in to their groups.
  465. * This means that the sequence matcher and diffs do not need to include the full
  466. * content of the different files but can still provide context as to where the
  467. * changes are.
  468. *
  469. * @param int $context The number of lines of context to provide around the groups.
  470. * @return array Nested array of all of the grouped opcodes.
  471. */
  472. public function getGroupedOpcodes($context=3)
  473. {
  474. $opCodes = $this->getOpCodes();
  475. if(empty($opCodes)) {
  476. $opCodes = array(
  477. array(
  478. 'equal',
  479. 0,
  480. 1,
  481. 0,
  482. 1
  483. )
  484. );
  485. }
  486. if($opCodes[0][0] == 'equal') {
  487. $opCodes[0] = array(
  488. $opCodes[0][0],
  489. max($opCodes[0][1], $opCodes[0][2] - $context),
  490. $opCodes[0][2],
  491. max($opCodes[0][3], $opCodes[0][4] - $context),
  492. $opCodes[0][4]
  493. );
  494. }
  495. $lastItem = count($opCodes) - 1;
  496. if($opCodes[$lastItem][0] == 'equal') {
  497. list($tag, $i1, $i2, $j1, $j2) = $opCodes[$lastItem];
  498. $opCodes[$lastItem] = array(
  499. $tag,
  500. $i1,
  501. min($i2, $i1 + $context),
  502. $j1,
  503. min($j2, $j1 + $context)
  504. );
  505. }
  506. $maxRange = $context * 2;
  507. $groups = array();
  508. $group = array();
  509. foreach($opCodes as $code) {
  510. list($tag, $i1, $i2, $j1, $j2) = $code;
  511. if($tag == 'equal' && $i2 - $i1 > $maxRange) {
  512. $group[] = array(
  513. $tag,
  514. $i1,
  515. min($i2, $i1 + $context),
  516. $j1,
  517. min($j2, $j1 + $context)
  518. );
  519. $groups[] = $group;
  520. $group = array();
  521. $i1 = max($i1, $i2 - $context);
  522. $j1 = max($j1, $j2 - $context);
  523. }
  524. $group[] = array(
  525. $tag,
  526. $i1,
  527. $i2,
  528. $j1,
  529. $j2
  530. );
  531. }
  532. if(!empty($group) && !(count($group) == 1 && $group[0][0] == 'equal')) {
  533. $groups[] = $group;
  534. }
  535. return $groups;
  536. }
  537. /**
  538. * Return a measure of the similarity between the two sequences.
  539. * This will be a float value between 0 and 1.
  540. *
  541. * Out of all of the ratio calculation functions, this is the most
  542. * expensive to call if getMatchingBlocks or getOpCodes is yet to be
  543. * called. The other calculation methods (quickRatio and realquickRatio)
  544. * can be used to perform quicker calculations but may be less accurate.
  545. *
  546. * The ratio is calculated as (2 * number of matches) / total number of
  547. * elements in both sequences.
  548. *
  549. * @return float The calculated ratio.
  550. */
  551. public function Ratio()
  552. {
  553. $matches = array_reduce($this->getMatchingBlocks(), array($this, 'ratioReduce'), 0);
  554. return $this->calculateRatio($matches, count ($this->a) + count ($this->b));
  555. }
  556. /**
  557. * Helper function to calculate the number of matches for Ratio().
  558. *
  559. * @param int $sum The running total for the number of matches.
  560. * @param array $triple Array containing the matching block triple to add to the running total.
  561. * @return int The new running total for the number of matches.
  562. */
  563. private function ratioReduce($sum, $triple)
  564. {
  565. return $sum + ($triple[count($triple) - 1]);
  566. }
  567. /**
  568. * Quickly return an upper bound ratio for the similarity of the strings.
  569. * This is quicker to compute than Ratio().
  570. *
  571. * @return float The calculated ratio.
  572. */
  573. private function quickRatio()
  574. {
  575. if($this->fullBCount === null) {
  576. $this->fullBCount = array();
  577. $bLength = count ($this->b);
  578. for($i = 0; $i < $bLength; ++$i) {
  579. $char = $this->b[$i];
  580. $this->fullBCount[$char] = $this->arrayGetDefault($this->fullBCount, $char, 0) + 1;
  581. }
  582. }
  583. $avail = array();
  584. $matches = 0;
  585. $aLength = count ($this->a);
  586. for($i = 0; $i < $aLength; ++$i) {
  587. $char = $this->a[$i];
  588. if(isset($avail[$char])) {
  589. $numb = $avail[$char];
  590. }
  591. else {
  592. $numb = $this->arrayGetDefault($this->fullBCount, $char, 0);
  593. }
  594. $avail[$char] = $numb - 1;
  595. if($numb > 0) {
  596. ++$matches;
  597. }
  598. }
  599. $this->calculateRatio($matches, count ($this->a) + count ($this->b));
  600. }
  601. /**
  602. * Return an upper bound ratio really quickly for the similarity of the strings.
  603. * This is quicker to compute than Ratio() and quickRatio().
  604. *
  605. * @return float The calculated ratio.
  606. */
  607. private function realquickRatio()
  608. {
  609. $aLength = count ($this->a);
  610. $bLength = count ($this->b);
  611. return $this->calculateRatio(min($aLength, $bLength), $aLength + $bLength);
  612. }
  613. /**
  614. * Helper function for calculating the ratio to measure similarity for the strings.
  615. * The ratio is defined as being 2 * (number of matches / total length)
  616. *
  617. * @param int $matches The number of matches in the two strings.
  618. * @param int $length The length of the two strings.
  619. * @return float The calculated ratio.
  620. */
  621. private function calculateRatio($matches, $length=0)
  622. {
  623. if($length) {
  624. return 2 * ($matches / $length);
  625. }
  626. else {
  627. return 1;
  628. }
  629. }
  630. /**
  631. * Helper function that provides the ability to return the value for a key
  632. * in an array of it exists, or if it doesn't then return a default value.
  633. * Essentially cleaner than doing a series of if(isset()) {} else {} calls.
  634. *
  635. * @param array $array The array to search.
  636. * @param string $key The key to check that exists.
  637. * @param mixed $default The value to return as the default value if the key doesn't exist.
  638. * @return mixed The value from the array if the key exists or otherwise the default.
  639. */
  640. private function arrayGetDefault($array, $key, $default)
  641. {
  642. if(isset($array[$key])) {
  643. return $array[$key];
  644. }
  645. else {
  646. return $default;
  647. }
  648. }
  649. /**
  650. * Sort an array by the nested arrays it contains. Helper function for getMatchingBlocks
  651. *
  652. * @param array $a First array to compare.
  653. * @param array $b Second array to compare.
  654. * @return int -1, 0 or 1, as expected by the usort function.
  655. */
  656. private function tupleSort($a, $b)
  657. {
  658. $max = max(count($a), count($b));
  659. for($i = 0; $i < $max; ++$i) {
  660. if($a[$i] < $b[$i]) {
  661. return -1;
  662. }
  663. else if($a[$i] > $b[$i]) {
  664. return 1;
  665. }
  666. }
  667. if(count($a) == count($b)) {
  668. return 0;
  669. }
  670. else if(count($a) < count($b)) {
  671. return -1;
  672. }
  673. else {
  674. return 1;
  675. }
  676. }
  677. }