177 lines
5.4 KiB
PHP
177 lines
5.4 KiB
PHP
<?php
/**
 * Class Google\Site_Kit\Core\Util\URL
 *
 * @package   Google\Site_Kit\Core\Util
 * @copyright 2022 Google LLC
 * @license   https://www.apache.org/licenses/LICENSE-2.0 Apache License 2.0
 * @link      https://sitekit.withgoogle.com
 */

namespace Google\Site_Kit\Core\Util;
|
|
|
|
/**
 * Class for custom URL parsing methods.
 *
 * Provides a multi-byte aware alternative to `wp_parse_url()` and helpers
 * that enumerate the common spellings (www / non-www, Unicode / Punycode,
 * https / http) of a site URL.
 *
 * @since 1.84.0
 * @access private
 * @ignore
 */
class URL {

	/**
	 * Prefix for Punycode-encoded hostnames.
	 */
	const PUNYCODE_PREFIX = 'xn--';

	/**
	 * Parses URLs with UTF-8 multi-byte characters,
	 * otherwise similar to `wp_parse_url()`.
	 *
	 * Input without multi-byte characters is handed straight to `wp_parse_url()`.
	 *
	 * @since 1.84.0
	 *
	 * @param string $url       The URL to parse.
	 * @param int    $component The specific component to retrieve. Use one of the PHP
	 *                          predefined constants to specify which one.
	 *                          Defaults to -1 (= return all parts as an array).
	 * @return mixed False on parse failure; Array of URL components on success;
	 *               When a specific component has been requested: null if the component
	 *               doesn't exist in the given URL; a string or - in the case of
	 *               PHP_URL_PORT - integer when it does. See parse_url()'s return values.
	 */
	public static function parse( $url, $component = -1 ) {
		$url = (string) $url;

		// Character count equals byte count: no multi-byte characters are present.
		if ( mb_strlen( $url, 'UTF-8' ) === strlen( $url ) ) {
			return wp_parse_url( $url, $component );
		}

		// Scheme-relative and root-relative URLs get a placeholder scheme (and host)
		// prepended before parsing; the placeholder parts are dropped again below.
		// The prefix is plain ASCII, so a byte-wise check is sufficient here.
		$to_unset = array();
		if ( 0 === strpos( $url, '//' ) ) {
			$to_unset[] = 'scheme';
			$url        = 'placeholder:' . $url;
		} elseif ( 0 === strpos( $url, '/' ) ) {
			$to_unset[] = 'scheme';
			$to_unset[] = 'host';
			$url        = 'placeholder://placeholder' . $url;
		}

		$parts = self::mb_parse_url( $url );

		if ( false === $parts ) {
			// Parsing failure.
			return $parts;
		}

		// Remove the placeholder values.
		foreach ( $to_unset as $key ) {
			unset( $parts[ $key ] );
		}

		return _get_component_from_parsed_url_array( $parts, $component );
	}

	/**
	 * Replacement for parse_url which is UTF-8 multi-byte character aware.
	 *
	 * Everything between the URL delimiters (`:/@?&=#`) is percent-encoded before
	 * being passed to `parse_url()`, and the resulting string components are
	 * decoded again afterwards.
	 *
	 * @since 1.84.0
	 *
	 * @param string $url The URL to parse.
	 * @return mixed False on parse failure; Array of URL components on success
	 */
	private static function mb_parse_url( $url ) {
		$enc_url = preg_replace_callback(
			'%[^:/@?&=#]+%usD',
			function ( $matches ) {
				return rawurlencode( $matches[0] );
			},
			$url
		);

		// preg_replace_callback() returns null when PCRE fails (for example on
		// malformed UTF-8, because of the `u` modifier). Treat that as a parse failure
		// instead of handing null to parse_url().
		if ( ! is_string( $enc_url ) ) {
			return false;
		}

		$parts = parse_url( $enc_url ); // phpcs:ignore WordPress.WP.AlternativeFunctions.parse_url_parse_url

		if ( false === $parts ) {
			return $parts;
		}

		foreach ( $parts as $name => $value ) {
			// Only string components carry percent-encoding. The port is returned as an
			// integer by parse_url() and must keep that type.
			if ( is_string( $value ) ) {
				$parts[ $name ] = urldecode( $value );
			}
		}

		return $parts;
	}

	/**
	 * Permutes site URL to cover all different variants of it (not considering the path).
	 *
	 * Only the host and path of the given URL are used: every host variant from
	 * `permute_site_hosts()` is combined with the unchanged path, once with the
	 * `https://` and once with the `http://` scheme (in that order).
	 *
	 * @since 1.99.0
	 *
	 * @param string $site_url Site URL to get permutations for.
	 * @return array List of permutations. Empty when no host could be determined.
	 */
	public static function permute_site_url( $site_url ) {
		$hostname = self::parse( $site_url, PHP_URL_HOST );
		$path     = self::parse( $site_url, PHP_URL_PATH );

		$urls = array();
		foreach ( self::permute_site_hosts( $hostname ) as $host ) {
			$host_with_path = $host . $path;

			$urls[] = "https://$host_with_path";
			$urls[] = "http://$host_with_path";
		}

		return $urls;
	}

	/**
	 * Generates common variations of the given hostname.
	 *
	 * Returns a list of hostnames that includes:
	 * - (if IDN) in Punycode encoding
	 * - (if IDN) in Unicode encoding
	 * - with and without www. subdomain (including IDNs)
	 *
	 * The IDN conversion is best-effort: when it fails, only the www / non-www
	 * variants of the given hostname are returned.
	 *
	 * @since 1.99.0
	 *
	 * @param string $hostname Hostname to generate variations of.
	 * @return string[] Hostname variations. Empty for an empty or non-string hostname.
	 */
	public static function permute_site_hosts( $hostname ) {
		if ( ! $hostname || ! is_string( $hostname ) ) {
			return array();
		}

		// See \Requests_IDNAEncoder::is_ascii.
		$is_ascii = preg_match( '/(?:[^\x00-\x7F])/', $hostname ) !== 1;
		$is_www   = 0 === strpos( $hostname, 'www.' );
		// Normalize hostname without www.
		$hostname = $is_www ? substr( $hostname, strlen( 'www.' ) ) : $hostname;
		$hosts    = array( $hostname, "www.$hostname" );

		// Holds the hostname in its "other" IDN representation, if there is one.
		$converted = false;

		try {
			// An ASCII hostname can only be non-IDN or punycode-encoded.
			if ( $is_ascii ) {
				// If the hostname is in punycode encoding, decode it.
				if ( 0 === strpos( $hostname, self::PUNYCODE_PREFIX ) || false !== strpos( $hostname, '.' . self::PUNYCODE_PREFIX ) ) {
					// Ignoring phpcs here, and not passing the variant so that the correct default can be selected by PHP based on the
					// version. INTL_IDNA_VARIANT_UTS46 for PHP>=7.4, INTL_IDNA_VARIANT_2003 for PHP<7.4.
					// phpcs:ignore PHPCompatibility.ParameterValues.NewIDNVariantDefault.NotSet
					$converted = idn_to_utf8( $hostname );
				}
			} else {
				// If it's not ASCII, then punycode-encode it.
				// Ignoring phpcs here, and not passing the variant so that the correct default can be selected by PHP based on the
				// version. INTL_IDNA_VARIANT_UTS46 for PHP>=7.4, INTL_IDNA_VARIANT_2003 for PHP<7.4.
				// phpcs:ignore PHPCompatibility.ParameterValues.NewIDNVariantDefault.NotSet
				$converted = idn_to_ascii( $hostname );
			}
		} catch ( \Exception $exception ) {
			// The leading backslash matters: an unqualified `Exception` would resolve to a
			// class inside this file's namespace and never match.
			$converted = false;
		}

		// The idn_*() functions return false on failure; only add usable results so that
		// no `false` / bare "www." entries end up in the list.
		if ( is_string( $converted ) && '' !== $converted ) {
			array_push( $hosts, $converted, "www.$converted" );
		}

		return $hosts;
	}
}
|