libdav/
encoding.rs

Help
// Copyright 2024 Hugo Osvaldo Barrera
//
// SPDX-License-Identifier: ISC

//! Functions for encoding and decoding percent-encoded path components.
//!
//! libdav follows the convention described below, and so should dependent software:
//!
//! - A `Uri` has its path component percent-encoded. The `Uri` type enforces validation of this.
//!   Additionally, all `Uri` instances produced by this library have their path component
//!   normalised with [`strict_percent_encoded`].
//! - `href` strings shall have only reserved characters percent-encoded. This makes them better
//!   suited for display, and for usage in filenames. These are frequently encoded into XML, so
//!   do not require escaping non-reserved characters. Handling of XML entities is done internally
//!   by libdav; consumers need not concern themselves with this.
//!
//! Hrefs should be treated as opaque strings. These MAY be percent-encoded, but keep in mind that
//! decoding them and re-encoding them may not yield the same result. From [RFC3986, section
//! 2.2](https://www.rfc-editor.org/rfc/rfc3986#section-2.2):
//!
//! > URIs that differ in the replacement of a reserved character with its corresponding
//! > percent-encoded octet are not equivalent.  Percent-encoding a reserved character, or
//! > decoding a percent-encoded octet that corresponds to a reserved character, will change how
//! > the URI is interpreted by most applications.  Thus, characters in the reserved set are
//! > protected from normalization and are therefore safe to be used by scheme-specific and
//! > producer-specific algorithms for delimiting data subcomponents within a URI.
//!
//! See [this short article](https://whynothugo.nl/journal/2024/12/27/urls-and-percent-encoding/)
//! for more details.

use std::borrow::Cow;

/// Returned when invalid input is provided to [`normalise_percent_encoded`].
///
/// Every variant describes a problem with the text which follows a `%` sign in the input.
#[derive(Debug, thiserror::Error, PartialEq)]
pub enum NormalisationError {
    /// Unexpected end of string after percent sign.
    ///
    /// A `%` must be followed by two more characters; the input ended before that.
    #[error("Unexpected end of string after percent sign.")]
    TruncatedPercent,
    /// Non-hexadecimal digits after percent sign.
    ///
    /// At least one of the two characters following a `%` is not a hexadecimal digit.
    #[error("Non-hexadecimal digits after percent sign.")]
    NonHex,
    /// Invalid sequence after percent sign.
    ///
    /// Carries the numeric value decoded from the offending escape. The `Display` message
    /// renders this value in lowercase hexadecimal.
    #[error("Invalid sequence after percent sign: {0:x}.")]
    InvalidPercent(u32),
}

/// Normalise a percent encoded path.
///
/// Reserved characters shall remain percent encoded, but their hexadecimal representation
/// normalised to uppercase. All other percent-encoded octets shall be decoded. Consecutive
/// octets which together encode a multi-byte character are decoded as UTF-8 (e.g.: `%C3%BC`
/// becomes `ü`).
///
/// Input which contains no percent sign is returned borrowed and unchanged.
///
/// Note that `%25` is decoded into a literal `%` (it is not a reserved character), so the output
/// of this function is not guaranteed to be valid input for this same function.
///
/// # Errors
///
/// Returns [`NormalisationError`] if the input string contains invalid percent-encoded data:
///
/// - [`NormalisationError::TruncatedPercent`] if fewer than two characters follow a `%`.
/// - [`NormalisationError::NonHex`] if either of the two characters following a `%` is not a
///   hexadecimal digit.
/// - [`NormalisationError::InvalidPercent`] if the decoded octets do not form valid UTF-8. The
///   payload is the first offending octet.
pub fn normalise_percent_encoded(input: &str) -> Result<Cow<str>, NormalisationError> {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    // The output is assembled as raw bytes and validated as UTF-8 once at the end. Decoding each
    // escape into a `char` on its own would treat every octet as a Latin-1 code point and
    // corrupt multi-byte sequences.
    let mut result: Vec<u8> = Vec::new();
    let mut last_pos = 0;
    let mut chars = input.char_indices();

    // `while let` rather than `for`: the two characters after a `%` are consumed in the body.
    while let Some((i, ch)) = chars.next() {
        if ch != '%' {
            continue;
        }

        let (Some((_, h)), Some((_, l))) = (chars.next(), chars.next()) else {
            return Err(NormalisationError::TruncatedPercent);
        };
        let (Some(high), Some(low)) = (h.to_digit(16), l.to_digit(16)) else {
            return Err(NormalisationError::NonHex);
        };
        // Lossless cast: both digits are below 16, so the value never exceeds 0xFF.
        let octet = (high * 16 + low) as u8;

        // Copy the literal text between the previous escape and this one.
        result.extend_from_slice(input[last_pos..i].as_bytes());
        match octet {
            b':' | b'/' | b'?' | b'#' | b'[' | b']' | b'@' | b'!' | b'$' | b'&' | b'\'' | b'('
            | b')' | b'*' | b'+' | b',' | b';' | b'=' => {
                // Reserved: Retain in encoded form with uppercase hex
                result.push(b'%');
                result.push(HEX_UPPER[usize::from(octet >> 4)]);
                result.push(HEX_UPPER[usize::from(octet & 0x0F)]);
            }
            _ => result.push(octet),
        }
        // Both hex digits are ASCII, so an escape always spans exactly three bytes.
        last_pos = i + 3;
    }

    // `last_pos` only moves past zero after an escape has been processed.
    if last_pos == 0 {
        return Ok(Cow::Borrowed(input));
    }
    result.extend_from_slice(input[last_pos..].as_bytes());

    // Literal segments come from a `&str` and are valid on their own, so the first invalid byte
    // is always one which was decoded from an escape.
    String::from_utf8(result).map(Cow::Owned).map_err(|err| {
        let offending = err.as_bytes()[err.utf8_error().valid_up_to()];
        NormalisationError::InvalidPercent(u32::from(offending))
    })
}

/// Percent-encode every character of a path which is neither reserved nor unreserved.
///
/// The input may already contain percent-encoded sequences: the `%` sign itself is never
/// encoded, so existing escapes pass through untouched. Reserved and unreserved characters are
/// copied verbatim. Any other character is replaced by the percent-encoded form (uppercase
/// hexadecimal) of each byte of its UTF-8 representation.
///
/// ```ignore
/// reserved    = gen-delims / sub-delims
/// gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
/// sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
///              / "*" / "+" / "," / ";" / "="
///
/// unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
/// ```
///
/// The resulting string is not display-friendly, but is suitable for use as the path in an HTTP
/// request.
///
/// Calling this function with an input where ANY non-reserved character has already been
/// escaped will produce a double-encoded output. This output would point to a resource different
/// than the original input.
///
/// When nothing needs encoding, the input is returned borrowed.
#[must_use]
pub fn strict_percent_encoded(input: &str) -> Cow<str> {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    let mut encoded = String::new();
    // Byte offset up to which `input` has already been copied into `encoded`.
    let mut copied_up_to = 0;

    for (pos, ch) in input.char_indices() {
        #[rustfmt::skip]
        let passes_through = ch.is_ascii_alphanumeric() || matches!(
            ch,
            '-' | '.' | '_' | '~' |
            ':' | '/' | '?' | '#' | '[' | ']' | '@' |
            '!' | '$' | '&' | '\'' | '(' | ')' |
            '*' | '+' | ',' | ';' | '=' |
            '%'
        );
        if passes_through {
            continue;
        }

        // Flush the untouched text preceding this character, then emit one escape per byte.
        encoded.push_str(&input[copied_up_to..pos]);
        let mut utf8 = [0u8; 4];
        for byte in ch.encode_utf8(&mut utf8).bytes() {
            encoded.push('%');
            encoded.push(char::from(HEX_UPPER[usize::from(byte >> 4)]));
            encoded.push(char::from(HEX_UPPER[usize::from(byte & 0x0F)]));
        }
        copied_up_to = pos + ch.len_utf8();
    }

    // An empty buffer means no character required encoding.
    if encoded.is_empty() {
        return Cow::Borrowed(input);
    }
    encoded.push_str(&input[copied_up_to..]);
    Cow::Owned(encoded)
}

#[cfg(test)]
mod test {
    use super::{normalise_percent_encoded, strict_percent_encoded, NormalisationError};

    // Tests for normalise_percent_encoded

    /// An escaped unreserved character is decoded.
    #[test]
    fn normalise_percent_encoded_valid_percent_encoding() {
        assert_eq!(normalise_percent_encoded("%41").unwrap(), "A");
    }

    /// A non-hexadecimal digit after the percent sign is rejected.
    #[test]
    fn normalise_percent_encoded_invalid_hex_characters() {
        let err = normalise_percent_encoded("%4G").unwrap_err();
        assert_eq!(err, NormalisationError::NonHex);
    }

    /// A single digit after the percent sign is rejected.
    #[test]
    fn normalise_percent_encoded_incomplete_percent_encoding() {
        let err = normalise_percent_encoded("%4").unwrap_err();
        assert_eq!(err, NormalisationError::TruncatedPercent);
    }

    /// A percent sign at the very end of the input is rejected.
    #[test]
    fn normalise_percent_encoded_trailing_percent() {
        let err = normalise_percent_encoded("hello%").unwrap_err();
        assert_eq!(err, NormalisationError::TruncatedPercent);
    }

    /// A literal reserved character is left as-is.
    #[test]
    fn normalise_percent_encoded_unencoded_reserved_character() {
        let unchanged = "hello/";
        assert_eq!(normalise_percent_encoded(unchanged).unwrap(), unchanged);
    }

    /// An escaped reserved character stays escaped, with uppercase hex digits.
    #[test]
    fn normalise_percent_encoded_reserved_character() {
        assert_eq!(normalise_percent_encoded("hello%2f").unwrap(), "hello%2F");
    }

    /// Empty input yields empty output.
    #[test]
    fn normalise_percent_encoded_empty_string() {
        assert_eq!(normalise_percent_encoded("").unwrap(), "");
    }

    // Tests for strict_percent_encoded

    /// Reserved characters are not encoded.
    #[test]
    fn strict_percent_encoded_reserved_characters() {
        let unchanged = ":/?#[]@!$&'()*+,;=";
        assert_eq!(strict_percent_encoded(unchanged), unchanged);
    }

    /// Unreserved characters are not encoded.
    #[test]
    fn strict_percent_encoded_unreserved_characters() {
        let unchanged = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-._~";
        assert_eq!(strict_percent_encoded(unchanged), unchanged);
    }

    /// Existing escapes are not encoded a second time.
    #[test]
    fn strict_percent_encoded_percent_encoded_characters() {
        let unchanged = "%20%2F%3F";
        assert_eq!(strict_percent_encoded(unchanged), unchanged);
    }

    /// Each byte of a multi-byte character gets its own escape.
    #[test]
    fn strict_percent_encoded_multibyte_characters() {
        assert_eq!(
            strict_percent_encoded("こんにちは"),
            "%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF",
        );
    }

    /// Umlauts, eszett and spaces are encoded; `!` is reserved and kept.
    #[test]
    fn strict_percent_encoded_german_special_characters() {
        assert_eq!(
            strict_percent_encoded("Grüße aus Köln!"),
            "Gr%C3%BC%C3%9Fe%20aus%20K%C3%B6ln!",
        );
    }

    /// Four-byte characters are encoded as four escapes each.
    #[test]
    fn strict_percent_encoded_emoji() {
        assert_eq!(strict_percent_encoded("😀🔥"), "%F0%9F%98%80%F0%9F%94%A5");
    }

    /// Only the characters which need it are encoded in mixed input.
    #[test]
    fn strict_percent_encoded_mixed_characters() {
        assert_eq!(
            strict_percent_encoded("Hello:/World%20😀"),
            "Hello:/World%20%F0%9F%98%80",
        );
    }

    /// The tilde is unreserved and therefore kept.
    #[test]
    fn strict_percent_encoded_tilde() {
        let unchanged = "~";
        assert_eq!(strict_percent_encoded(unchanged), unchanged);
    }
}
libdav/encoding.rs

libdav/
encoding.rs