|
- <?php
-
- /**
- * Parses a URI into the components and fragment identifier as specified
- * by RFC 3986.
- */
- class HTMLPurifier_URIParser
- {
-
- /**
- * Instance of HTMLPurifier_PercentEncoder to do normalization with.
- */
- protected $percentEncoder;
-
- public function __construct()
- {
- $this->percentEncoder = new HTMLPurifier_PercentEncoder();
- }
-
- /**
- * Parses a URI.
- * @param $uri string URI to parse
- * @return HTMLPurifier_URI representation of URI. This representation has
- * not been validated yet and may not conform to RFC.
- */
- public function parse($uri)
- {
- $uri = $this->percentEncoder->normalize($uri);
-
- // Regexp is as per Appendix B.
- // Note that ["<>] are an addition to the RFC's recommended
- // characters, because they represent external delimeters.
- $r_URI = '!'.
- '(([a-zA-Z0-9\.\+\-]+):)?'. // 2. Scheme
- '(//([^/?#"<>]*))?'. // 4. Authority
- '([^?#"<>]*)'. // 5. Path
- '(\?([^#"<>]*))?'. // 7. Query
- '(#([^"<>]*))?'. // 8. Fragment
- '!';
-
- $matches = array();
- $result = preg_match($r_URI, $uri, $matches);
-
- if (!$result) return false; // *really* invalid URI
-
- // seperate out parts
- $scheme = !empty($matches[1]) ? $matches[2] : null;
- $authority = !empty($matches[3]) ? $matches[4] : null;
- $path = $matches[5]; // always present, can be empty
- $query = !empty($matches[6]) ? $matches[7] : null;
- $fragment = !empty($matches[8]) ? $matches[9] : null;
-
- // further parse authority
- if ($authority !== null) {
- $r_authority = "/^((.+?)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
- $matches = array();
- preg_match($r_authority, $authority, $matches);
- $userinfo = !empty($matches[1]) ? $matches[2] : null;
- $host = !empty($matches[3]) ? $matches[3] : '';
- $port = !empty($matches[4]) ? (int) $matches[5] : null;
- } else {
- $port = $host = $userinfo = null;
- }
-
- return new HTMLPurifier_URI(
- $scheme, $userinfo, $host, $port, $path, $query, $fragment);
- }
-
- }
-
- // vim: et sw=4 sts=4
|