close
Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/wp-includes/SimplePie/src/Misc.php
Original file line number Diff line number Diff line change
Expand Up @@ -1875,7 +1875,11 @@ public static function atom_10_content_construct_type($attribs)

public static function is_isegment_nz_nc($string)
{
return (bool) preg_match('/^([A-Za-z0-9\-._~\x{A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}\x{10000}-\x{1FFFD}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}\x{40000}-\x{4FFFD}\x{50000}-\x{5FFFD}\x{60000}-\x{6FFFD}\x{70000}-\x{7FFFD}\x{80000}-\x{8FFFD}\x{90000}-\x{9FFFD}\x{A0000}-\x{AFFFD}\x{B0000}-\x{BFFFD}\x{C0000}-\x{CFFFD}\x{D0000}-\x{DFFFD}\x{E1000}-\x{EFFFD}!$&\'()*+,;=@]|(%[0-9ABCDEF]{2}))+$/u', $string);
if ( _wp_can_use_pcre_u() ) {
return (bool) preg_match('/^([A-Za-z0-9\-._~\x{A0}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFEF}\x{10000}-\x{1FFFD}\x{20000}-\x{2FFFD}\x{30000}-\x{3FFFD}\x{40000}-\x{4FFFD}\x{50000}-\x{5FFFD}\x{60000}-\x{6FFFD}\x{70000}-\x{7FFFD}\x{80000}-\x{8FFFD}\x{90000}-\x{9FFFD}\x{A0000}-\x{AFFFD}\x{B0000}-\x{BFFFD}\x{C0000}-\x{CFFFD}\x{D0000}-\x{DFFFD}\x{E1000}-\x{EFFFD}!$&\'()*+,;=@]|(%[0-9ABCDEF]{2}))+$/u', $string);
} else {
return (bool) preg_match('/^([A-Za-z0-9\-._~!$&\'()*+,;=@]|(%[0-9ABCDEF]{2}))+$/', $string);
}
}

public static function space_separated_tokens($string)
Expand Down
10 changes: 8 additions & 2 deletions src/wp-includes/class-wp-plugin-dependencies.php
Original file line number Diff line number Diff line change
Expand Up @@ -603,8 +603,14 @@ protected static function sanitize_dependency_slugs( $slugs ) {
$slug = apply_filters( 'wp_plugin_dependencies_slug', $slug );

// Match to WordPress.org slug format.
if ( preg_match( '/^[a-z0-9]+(-[a-z0-9]+)*$/mu', $slug ) ) {
$sanitized_slugs[] = $slug;
if ( _wp_can_use_pcre_u() ) {
if ( preg_match( '/^[a-z0-9]+(-[a-z0-9]+)*$/mu', $slug ) ) {
$sanitized_slugs[] = $slug;
}
} else {
if ( preg_match( '/^[a-z0-9]+(-[a-z0-9]+)*$/m', $slug ) ) {
$sanitized_slugs[] = $slug;
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PCRE pattern is only looking at US-ASCII characters and doesn’t even need the UTF-8 flag. Do you see any reason not to simply remove the flag here?

}
}
$sanitized_slugs = array_unique( $sanitized_slugs );
Expand Down
31 changes: 29 additions & 2 deletions src/wp-includes/class-wp-text-diff-renderer-inline.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,35 @@
*/
public function _splitOnWords( $string, $newlineEscape = "\n" ) { // phpcs:ignore Universal.NamingConventions.NoReservedKeywordParameterNames.stringFound,WordPress.NamingConventions.ValidVariableName.VariableNotSnakeCase
$string = str_replace( "\0", '', $string );
$words = preg_split( '/([^\w])/u', $string, -1, PREG_SPLIT_DELIM_CAPTURE );
$words = str_replace( "\n", $newlineEscape, $words ); // phpcs:ignore WordPress.NamingConventions.ValidVariableName.VariableNotSnakeCase
if ( _wp_can_use_pcre_u() ) {
$words = preg_split( '/([^\w])/u', $string, -1, PREG_SPLIT_DELIM_CAPTURE );
} else {
if ( function_exists( 'mb_str_split' ) ) {
$chars = mb_str_split( $string, 1, 'UTF-8' );

Check failure on line 33 in src/wp-includes/class-wp-text-diff-renderer-inline.php

View workflow job for this annotation

Image GitHub Actions / Check PHP compatibility / Run compatibility checks

The function mb_str_split() is not present in PHP version 7.3 or earlier
} else {
$chars = str_split( $string );
}
$words = array();
$current_word = '';

foreach ( $chars as $char ) {
// Simple heuristic: letters, numbers, underscore = word characters
if ( ctype_alnum( $char ) || '_' === $char || ord( $char ) > 127 ) {
$current_word .= $char;
} else {
if ( '' !== $current_word ) {
$words[] = $current_word;
$current_word = '';
}
$words[] = $char; // Capture delimiter
}
}
if ( '' !== $current_word ) {
$words[] = $current_word;
}
}

$words = str_replace( "\n", $newlineEscape, $words ); // phpcs:ignore WordPress.NamingConventions.ValidVariableName.VariableNotSnakeCase
return $words;
}
}
12 changes: 10 additions & 2 deletions src/wp-includes/comment.php
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,11 @@ function check_comment( $author, $email, $url, $comment, $user_ip, $user_agent,
* Check the comment fields for moderation keywords. If any are found,
* fail the check for the given field by returning false.
*/
$pattern = "#$word#iu";
if ( _wp_can_use_pcre_u() ) {
$pattern = "#$word#iu";
} else {
$pattern = "#$word#i";
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect this could be another case where it’s okay to remove the UTF-8 flag, because whatever $word is, it’s going to appear here as bytes, not as source code. That means it’s already matching sequences of the requested bytes/text.

It would be good to verify this. One setup would be to run PHP with an internal_encoding of latin1 (if that’s even possible; I can’t remember whether changing the internal encoding has been removed) and then test "b\xC3\xBCch" against "#b\xFCch#i". If these match, then the PCRE functions are converting text before matching. If they don’t match, I think we can probably remove the flag.

if ( preg_match( $pattern, $author ) ) {
return false;
}
Expand Down Expand Up @@ -1415,7 +1419,11 @@ function wp_check_comment_disallowed_list( $author, $email, $url, $comment, $use
// Do some escaping magic so that '#' chars in the spam words don't break things:
$word = preg_quote( $word, '#' );

$pattern = "#$word#iu";
if ( _wp_can_use_pcre_u() ) {
$pattern = "#$word#iu";
} else {
$pattern = "#$word#i";
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same as above: is the u flag necessary here, given that we’re injecting runtime bytes into the pattern and not attempting to translate source code?

if ( preg_match( $pattern, $author )
|| preg_match( $pattern, $email )
|| preg_match( $pattern, $url )
Expand Down
25 changes: 13 additions & 12 deletions src/wp-includes/formatting.php
Original file line number Diff line number Diff line change
Expand Up @@ -1048,7 +1048,7 @@
// Valid two-byte code points.

if ( $b1 >= 0xC2 && $b1 <= 0xDF && $b2 >= 0x80 && $b2 <= 0xBF ) {
$i++;
++$i;
continue;
}

Expand Down Expand Up @@ -1289,19 +1289,12 @@
return $text;
}

// Check for support for utf8 in the installed PCRE library once and store the result in a static.
static $utf8_pcre = null;
if ( ! isset( $utf8_pcre ) ) {
// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged
$utf8_pcre = @preg_match( '/^./u', 'a' );
}
// We can't demand utf8 in the PCRE installation, so just return the string in those cases.
if ( ! $utf8_pcre ) {
// Check for support for utf8 in the installed PCRE library.
if ( ! _wp_can_use_pcre_u() ) {
return $text;
}

// phpcs:ignore WordPress.PHP.NoSilencedErrors.Discouraged -- preg_match fails when it encounters invalid UTF8 in $text.
if ( 1 === @preg_match( '/^./us', $text ) ) {
if ( 1 === preg_match( '/^./us', $text ) ) {
return $text;
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this whole function has been updated in trunk. these changes are no longer relevant.


Expand Down Expand Up @@ -4233,7 +4226,15 @@

if ( str_starts_with( wp_get_word_count_type(), 'characters' ) && preg_match( '/^utf\-?8$/i', get_option( 'blog_charset' ) ) ) {
$text = trim( preg_replace( "/[\n\r\t ]+/", ' ', $text ), ' ' );
preg_match_all( '/./u', $text, $words_array );
if ( _wp_can_use_pcre_u() ) {
preg_match_all( '/./u', $text, $words_array );
} else {
if ( function_exists( 'mb_str_split' ) ) {
$words_array = array( mb_str_split( $text, 1, 'UTF-8' ) );

Check failure on line 4233 in src/wp-includes/formatting.php

View workflow job for this annotation

Image GitHub Actions / Check PHP compatibility / Run compatibility checks

The function mb_str_split() is not present in PHP version 7.3 or earlier
} else {
$words_array = array( str_split( $text ) );
}
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have this function slated for much bigger updates. I would recommend against updating the PCRE usage here because of that.

$words_array = array_slice( $words_array[0], 0, $num_words + 1 );
$sep = '';
} else {
Expand Down
24 changes: 20 additions & 4 deletions src/wp-includes/functions.php
Original file line number Diff line number Diff line change
Expand Up @@ -360,12 +360,20 @@ function wp_maybe_decline_date( $date, $format = '' ) {
$decline = preg_match( '#[dj]\.? F#', $format );
} else {
// If the format is not passed, try to guess it from the date string.
$decline = preg_match( '#\b\d{1,2}\.? [^\d ]+\b#u', $date );
if ( _wp_can_use_pcre_u() ) {
$decline = preg_match( '#\b\d{1,2}\.? [^\d ]+\b#u', $date );
} else {
$decline = preg_match( '#\b\d{1,2}\.? [^\d ]+\b#', $date );
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is the difference between the \b with and without the UTF-8 flag?

}

if ( $decline ) {
foreach ( $months as $key => $month ) {
$months[ $key ] = '# ' . preg_quote( $month, '#' ) . '\b#u';
if ( _wp_can_use_pcre_u() ) {
$months[ $key ] = '# ' . preg_quote( $month, '#' ) . '\b#u';
} else {
$months[ $key ] = '# ' . preg_quote( $month, '#' ) . '\b#';
}
}

foreach ( $months_genitive as $key => $month ) {
Expand All @@ -383,12 +391,20 @@ function wp_maybe_decline_date( $date, $format = '' ) {
$decline = preg_match( '#F [dj]#', $format );
} else {
// If the format is not passed, try to guess it from the date string.
$decline = preg_match( '#\b[^\d ]+ \d{1,2}(st|nd|rd|th)?\b#u', trim( $date ) );
if ( _wp_can_use_pcre_u() ) {
$decline = preg_match( '#\b[^\d ]+ \d{1,2}(st|nd|rd|th)?\b#u', trim( $date ) );
} else {
$decline = preg_match( '#\b[^\d ]+ \d{1,2}(st|nd|rd|th)?\b#', trim( $date ) );
}
}

if ( $decline ) {
foreach ( $months as $key => $month ) {
$months[ $key ] = '#\b' . preg_quote( $month, '#' ) . ' (\d{1,2})(st|nd|rd|th)?([-–]\d{1,2})?(st|nd|rd|th)?\b#u';
if ( _wp_can_use_pcre_u() ) {
$months[ $key ] = '#\b' . preg_quote( $month, '#' ) . ' (\d{1,2})(st|nd|rd|th)?([-–]\d{1,2})?(st|nd|rd|th)?\b#u';
} else {
$months[ $key ] = '#\b' . preg_quote( $month, '#' ) . ' (\d{1,2})(st|nd|rd|th)?([-–]\d{1,2})?(st|nd|rd|th)?\b#';
}
}

foreach ( $months_genitive as $key => $month ) {
Expand Down
28 changes: 27 additions & 1 deletion src/wp-includes/link-template.php
Original file line number Diff line number Diff line change
Expand Up @@ -4564,7 +4564,33 @@ function get_avatar_data( $id_or_email, $args = null ) {
}

if ( ! empty( $name ) ) {
if ( preg_match( '/\p{Han}|\p{Hiragana}|\p{Katakana}|\p{Hangul}/u', $name ) || false === strpos( $name, ' ' ) ) {
$is_cjk = false;

if ( _wp_can_use_pcre_u() ) {
$is_cjk = preg_match( '/\p{Han}|\p{Hiragana}|\p{Katakana}|\p{Hangul}/u', $name );
} elseif ( class_exists( 'IntlChar' ) ) {
$first_char = mb_substr( $name, 0, 1, 'UTF-8' );
$codepoint = IntlChar::ord( $first_char );

if ( null !== $codepoint ) {
$block = IntlChar::getBlockCode( $codepoint );
$cjk_blocks = array(
IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS,
IntlChar::BLOCK_CODE_HANGUL_SYLLABLES,
IntlChar::BLOCK_CODE_HIRAGANA,
IntlChar::BLOCK_CODE_KATAKANA,
IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A,
IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B,
IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_C,
IntlChar::BLOCK_CODE_CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
IntlChar::BLOCK_CODE_HANGUL_JAMO,
IntlChar::BLOCK_CODE_HANGUL_COMPATIBILITY_JAMO,
);
$is_cjk = in_array( $block, $cjk_blocks, true );
}
}

if ( $is_cjk || false === strpos( $name, ' ' ) ) {
$initials = mb_substr( $name, 0, min( 2, mb_strlen( $name, 'UTF-8' ) ), 'UTF-8' );
} else {
$first = mb_substr( $name, 0, 1, 'UTF-8' );
Expand Down
8 changes: 7 additions & 1 deletion src/wp-includes/pomo/po.php
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,13 @@
$unpoified = '';
$previous_is_backslash = false;
foreach ( $lines as $line ) {
preg_match_all( '/./u', $line, $chars );
if ( _wp_can_use_pcre_u() ) {
preg_match_all( '/./u', $line, $chars );
} elseif ( function_exists( 'mb_str_split' ) ) {
$chars = array( mb_str_split( $line, 1, 'UTF-8' ) );

Check failure on line 161 in src/wp-includes/pomo/po.php

View workflow job for this annotation

Image GitHub Actions / Check PHP compatibility / Run compatibility checks

The function mb_str_split() is not present in PHP version 7.3 or earlier
} else {
$chars = array( str_split( $line ) );
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This splitting of lines into characters and looking for a backslash is something we can probably do away with entirely: a streaming approach with strpos( $line, '\\' ) would suffice because all of the escapes are US-ASCII. This means we don’t need to split the lines and we don’t need a million string concatenations.

$chars = $chars[0];
foreach ( $chars as $char ) {
if ( ! $previous_is_backslash ) {
Expand Down
6 changes: 5 additions & 1 deletion src/wp-includes/shortcodes.php
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,11 @@ function get_shortcode_atts_regex() {
function shortcode_parse_atts( $text ) {
$atts = array();
$pattern = get_shortcode_atts_regex();
$text = preg_replace( "/[\x{00a0}\x{200b}]+/u", ' ', $text );
if ( _wp_can_use_pcre_u() ) {
$text = preg_replace( "/[\x{00a0}\x{200b}]+/u", ' ', $text );
} else {
$text = str_replace( array( "\xc2\xa0", "\xe2\x80\x8b" ), ' ', $text );
}
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if the regex isn’t necessary, it would seem fine to replace it directly with the str_replace(), but two ideas:

  • use strtr( $text, array( … ) )
  • use the Unicode string literals like "\u{00A0}" and "\u{200B}"

although it would be good to verify that all supported versions of PHP support that Unicode syntax without any extensions. I think they do.

if ( preg_match_all( $pattern, $text, $match, PREG_SET_ORDER ) ) {
foreach ( $match as $m ) {
if ( ! empty( $m[1] ) ) {
Expand Down
Loading