Skip to content

Commit 3a27ae5

Browse files
authored
feat: move text encoding functionality into deno_graph::source from CLI (#370)
1 parent 19e2b28 commit 3a27ae5

File tree

2 files changed

+207
-9
lines changed

2 files changed

+207
-9
lines changed

src/source/mod.rs

+187-9
Original file line numberDiff line numberDiff line change
@@ -306,20 +306,71 @@ pub trait NpmResolver: fmt::Debug {
306306
pub fn load_data_url(
307307
specifier: &ModuleSpecifier,
308308
) -> Result<Option<LoadResponse>, anyhow::Error> {
309-
let url = DataUrl::process(specifier.as_str())
310-
.map_err(|_| anyhow!("Unable to decode data url."))?;
311-
let (bytes, _) = url
312-
.decode_to_vec()
313-
.map_err(|_| anyhow!("Unable to decode data url."))?;
314-
let mut headers: HashMap<String, String> = HashMap::with_capacity(1);
315-
headers.insert("content-type".to_string(), url.mime_type().to_string());
309+
let data_url = RawDataUrl::parse(specifier)?;
310+
let (bytes, headers) = data_url.into_bytes_and_headers();
316311
Ok(Some(LoadResponse::Module {
317312
specifier: specifier.clone(),
318313
maybe_headers: Some(headers),
319314
content: Arc::from(bytes),
320315
}))
321316
}
322317

318+
#[derive(Debug, Clone)]
319+
pub struct RawDataUrl {
320+
pub mime_type: String,
321+
pub bytes: Vec<u8>,
322+
}
323+
324+
impl RawDataUrl {
325+
pub fn parse(specifier: &ModuleSpecifier) -> Result<Self, Error> {
326+
let url = DataUrl::process(specifier.as_str())
327+
.map_err(|_| anyhow!("Unable to decode data url."))?;
328+
let (bytes, _) = url
329+
.decode_to_vec()
330+
.map_err(|_| anyhow!("Unable to decode data url."))?;
331+
Ok(RawDataUrl {
332+
mime_type: url.mime_type().to_string(),
333+
bytes,
334+
})
335+
}
336+
337+
pub fn charset(&self) -> Option<&str> {
338+
get_mime_type_charset(&self.mime_type)
339+
}
340+
341+
pub fn media_type(&self) -> MediaType {
342+
let mut content_types = self.mime_type.split(';');
343+
let Some(content_type) = content_types.next() else {
344+
return MediaType::Unknown;
345+
};
346+
MediaType::from_content_type(
347+
// this data url will be ignored when resolving the MediaType
348+
// as in this rare case the MediaType is determined solely based
349+
// on the provided content type
350+
&ModuleSpecifier::parse("data:image/png;base64,").unwrap(),
351+
content_type,
352+
)
353+
}
354+
355+
pub fn decode(self) -> Result<String, std::io::Error> {
356+
let charset = get_mime_type_charset(&self.mime_type).unwrap_or("utf-8");
357+
decode_owned_source_with_charset(self.bytes, charset)
358+
}
359+
360+
pub fn into_bytes_and_headers(self) -> (Vec<u8>, HashMap<String, String>) {
361+
let headers = HashMap::from([("content-type".to_string(), self.mime_type)]);
362+
(self.bytes, headers)
363+
}
364+
}
365+
366+
/// Extracts the value of the `charset` parameter from a MIME type string
/// such as `text/plain; charset=utf-8`.
///
/// The first `;`-separated segment (the content type itself) is skipped.
/// The parameter name is matched ASCII case-insensitively, and a value
/// wrapped in double quotes is returned without the quotes. Returns `None`
/// when no `charset` parameter is present.
fn get_mime_type_charset(mime_type: &str) -> Option<&str> {
  mime_type.split(';').skip(1).find_map(|param| {
    let (name, value) = param.trim().split_once('=')?;
    if !name.eq_ignore_ascii_case("charset") {
      return None;
    }
    // Only strip the quotes when both are present; a lone quote is left
    // as-is rather than guessing at the intended value.
    let unquoted = value
      .strip_prefix('"')
      .and_then(|rest| rest.strip_suffix('"'))
      .unwrap_or(value);
    Some(unquoted)
  })
}
373+
323374
/// An implementation of the loader attribute where the responses are provided
324375
/// ahead of time. This is useful for testing.
325376
#[derive(Default)]
@@ -500,8 +551,10 @@ pub fn resolve_media_type_and_charset_from_content_type<'a>(
500551
) -> (MediaType, Option<&'a str>) {
501552
if let Some(content_type) = maybe_content_type {
502553
let mut content_types = content_type.split(';');
503-
let content_type = content_types.next().unwrap();
504-
let media_type = MediaType::from_content_type(specifier, content_type);
554+
let media_type = content_types
555+
.next()
556+
.map(|content_type| MediaType::from_content_type(specifier, content_type))
557+
.unwrap_or(MediaType::Unknown);
505558
let charset = content_types
506559
.map(str::trim)
507560
.find_map(|s| s.strip_prefix("charset="));
@@ -512,6 +565,54 @@ pub fn resolve_media_type_and_charset_from_content_type<'a>(
512565
}
513566
}
514567

568+
/// Decodes the source bytes into a string handling any encoding rules
569+
/// where the bytes may be from a remote module, file module, or other.
570+
pub fn decode_owned_source(
571+
specifier: &ModuleSpecifier,
572+
bytes: Vec<u8>,
573+
maybe_charset: Option<&str>,
574+
) -> Result<String, std::io::Error> {
575+
let charset = maybe_charset.unwrap_or_else(|| {
576+
if specifier.scheme() == "file" {
577+
text_encoding::detect_charset(&bytes)
578+
} else {
579+
"utf-8"
580+
}
581+
});
582+
decode_owned_source_with_charset(bytes, charset)
583+
}
584+
585+
/// Decodes the source bytes into a string handling any encoding rules
586+
/// where the source is a `file:` specifier.
587+
pub fn decode_owned_file_source(
588+
bytes: Vec<u8>,
589+
) -> Result<String, std::io::Error> {
590+
let charset = text_encoding::detect_charset(&bytes);
591+
decode_owned_source_with_charset(bytes, charset)
592+
}
593+
594+
fn decode_owned_source_with_charset(
595+
bytes: Vec<u8>,
596+
charset: &str,
597+
) -> Result<String, std::io::Error> {
598+
match text_encoding::convert_to_utf8(&bytes, charset)? {
599+
Cow::Borrowed(text) => {
600+
if text.starts_with(text_encoding::BOM_CHAR) {
601+
Ok(text[text_encoding::BOM_CHAR.len_utf8()..].to_string())
602+
} else {
603+
Ok(
604+
// SAFETY: we know it's a valid utf-8 string at this point
605+
unsafe { String::from_utf8_unchecked(bytes) },
606+
)
607+
}
608+
}
609+
Cow::Owned(mut text) => {
610+
text_encoding::strip_bom_mut(&mut text);
611+
Ok(text)
612+
}
613+
}
614+
}
615+
515616
/// Decodes the source bytes into a string handling any encoding rules
516617
/// for local vs remote files and dealing with the charset.
517618
pub fn decode_source(
@@ -828,4 +929,81 @@ pub mod tests {
828929
);
829930
}
830931
}
932+
933+
#[test]
fn test_parse_valid_data_url() {
  // The payload is the base64 encoding of "Hello, World!".
  let specifier =
    ModuleSpecifier::parse("data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==")
      .unwrap();
  let RawDataUrl { mime_type, bytes } = RawDataUrl::parse(&specifier).unwrap();
  assert_eq!(mime_type, "text/plain");
  assert_eq!(bytes, b"Hello, World!");
}
941+
942+
#[test]
fn test_charset_with_valid_mime_type() {
  // A `charset` parameter in the MIME type is surfaced by `charset()`.
  let mime_type = String::from("text/plain; charset=utf-8");
  let data_url = RawDataUrl {
    mime_type,
    bytes: Vec::new(),
  };
  assert_eq!(data_url.charset(), Some("utf-8"));
}
950+
951+
#[test]
fn test_charset_with_no_charset_in_mime_type() {
  // Without any parameters there is no charset to report.
  let mime_type = String::from("text/plain");
  let data_url = RawDataUrl {
    mime_type,
    bytes: Vec::new(),
  };
  assert_eq!(data_url.charset(), None);
}
959+
960+
#[test]
fn test_media_type_with_known_type() {
  // Parameters after the `;` must not affect media type resolution.
  let mime_type = String::from("application/javascript;charset=utf-8");
  let data_url = RawDataUrl {
    mime_type,
    bytes: Vec::new(),
  };
  assert_eq!(data_url.media_type(), MediaType::JavaScript);
}
968+
969+
#[test]
fn test_media_type_with_unknown_type() {
  // An unrecognized content type resolves to `MediaType::Unknown`.
  let mime_type = String::from("unknown/unknown");
  let data_url = RawDataUrl {
    mime_type,
    bytes: Vec::new(),
  };
  assert_eq!(data_url.media_type(), MediaType::Unknown);
}
977+
978+
#[test]
fn test_decode_with_valid_charset() {
  // utf-8 bytes declared as utf-8 decode to the same text.
  let data_url = RawDataUrl {
    mime_type: String::from("text/plain; charset=utf-8"),
    bytes: b"Hello, World!".to_vec(),
  };
  let decoded = data_url.decode().unwrap();
  assert_eq!(decoded, "Hello, World!");
}
986+
987+
#[test]
fn test_decode_with_invalid_charset() {
  // A charset that is not recognized must be reported as an error.
  let data_url = RawDataUrl {
    mime_type: String::from("text/plain; charset=invalid-charset"),
    bytes: Vec::new(),
  };
  let result = data_url.decode();
  assert!(result.is_err());
}
995+
996+
#[test]
fn test_into_bytes_and_headers() {
  // The body is passed through untouched and the full MIME type string is
  // exposed as the `content-type` header.
  let (bytes, headers) = RawDataUrl {
    mime_type: String::from("text/plain; charset=utf-8"),
    bytes: b"Hello, World!".to_vec(),
  }
  .into_bytes_and_headers();
  assert_eq!(bytes, b"Hello, World!");
  let content_type = headers.get("content-type").unwrap();
  assert_eq!(content_type, "text/plain; charset=utf-8");
}
8311009
}

src/text_encoding.rs

+20
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ pub fn strip_bom_mut(text: &mut String) {
4848

4949
#[cfg(test)]
5050
mod test {
51+
use std::io::ErrorKind;
52+
5153
use super::*;
5254

5355
fn test_detection(test_data: &[u8], expected_charset: &str) {
@@ -91,4 +93,22 @@ mod test {
9193
strip_bom_mut(&mut text);
9294
assert_eq!(text, "text");
9395
}
96+
97+
#[test]
fn test_decoding_unsupported_charset() {
  // Requesting a charset that is not supported must be rejected as invalid
  // input, even when there are no bytes to decode.
  let test_data: Vec<u8> = Vec::new();
  let err = convert_to_utf8(&test_data, "utf-32le").expect_err("Err expected");
  // `assert_eq!` prints the actual kind if this ever regresses.
  assert_eq!(err.kind(), ErrorKind::InvalidInput);
}
105+
106+
#[test]
fn test_decoding_invalid_utf8() {
  // The bytes 0xFE and 0xFF never occur in well-formed utf-8, so decoding
  // them as utf-8 must fail with an invalid data error.
  let test_data = b"\xFE\xFE\xFF\xFF".to_vec();
  let err = convert_to_utf8(&test_data, "utf-8").expect_err("Err expected");
  // `assert_eq!` prints the actual kind if this ever regresses.
  assert_eq!(err.kind(), ErrorKind::InvalidData);
}
94114
}

0 commit comments

Comments
 (0)