diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ea8c4bf --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +/target diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..ff9a9a9 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,45 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "type": "lldb", + "request": "launch", + "name": "Debug executable 'doc-parser'", + "cargo": { + "args": [ + "build", + "--bin=doc-parser", + "--package=doc-parser" + ], + "filter": { + "name": "doc-parser", + "kind": "bin" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + }, + { + "type": "lldb", + "request": "launch", + "name": "Debug unit tests in executable 'doc-parser'", + "cargo": { + "args": [ + "test", + "--no-run", + "--bin=doc-parser", + "--package=doc-parser" + ], + "filter": { + "name": "doc-parser", + "kind": "bin" + } + }, + "args": [], + "cwd": "${workspaceFolder}" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..b47aa25 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,9 @@ +{ + "spellright.language": [ + "nl" + ], + "spellright.documentTypes": [ + "latex", + "plaintext" + ] +} \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..fc67eff --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,248 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + +[[package]] +name = "arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" +dependencies = [ + "derive_arbitrary", +] + +[[package]] +name = "bitflags" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf4b9d6a944f767f8e5e0db018570623c85f3d925ac718db4e06d0187adb21c1" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + +[[package]] +name = "crc32fast" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345" + +[[package]] +name = "derive_arbitrary" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.60", +] + +[[package]] +name = "derive_more" +version = "0.99.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb810d30a7c1953f91334de7244731fc3f3c10d7fe163338a35b9f640960321" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "rustc_version", + "syn 1.0.109", +] + +[[package]] +name = "doc-parser" +version = "0.1.0" +dependencies = [ + "docx-rust", +] + +[[package]] +name = "docx-rust" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "895c3d3886569f4b8ae770b29a5fca9ac31ccdfbf91c5ff006d93c74449367c4" +dependencies = [ + "derive_more", + "hard-xml", + "log", + "zip", +] + +[[package]] +name = "flate2" +version = "1.0.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f54427cfd1c7829e2a139fcefea601bf088ebca651d2bf53ebc600eac295dae" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "hard-xml" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a344e0cef8802f37dc47f17c01a04354d3e66d9f6c8744108b0912f616efe266" +dependencies = [ + "hard-xml-derive", + "jetscii", + "lazy_static", + "memchr", + "xmlparser", +] + +[[package]] +name = "hard-xml-derive" +version = "1.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfae7cdfe23e50ea96929ccf1948d9ae1d8608353556461e5de247463d3a4f6" +dependencies = [ + "bitflags", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "jetscii" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47f142fe24a9c9944451e8349de0a56af5f3e7226dc46f3ed4d4ecc0b85af75e" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "log" +version = "0.4.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90ed8c1e510134f979dbc4f070f87d4313098b704861a105fe34231c70a3901c" + +[[package]] +name = "memchr" +version = "2.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8640c5d730cb13ebd907d8d04b52f55ac9a2eec55b440c8892f40d56c76c1d" + +[[package]] +name = "miniz_oxide" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +dependencies = [ + "adler", +] + +[[package]] +name = "proc-macro2" +version = "1.0.81" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + +[[package]] +name = "semver" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" + +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + +[[package]] +name = "zip" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e6cb8909b2e8e6733c9ef67d35be1a27105644d362aafb5f8b2ba395727adf6" +dependencies = [ + "arbitrary", + "byteorder", + "crc32fast", + "crossbeam-utils", + "flate2", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..76c20ab --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "doc-parser" +version = "0.1.0" +edition = "2021" + +[dependencies] +docx-rust = "0.1.7" +# docx-rust = { git="https://github.com/erikvullings/docx-rs.git" } + + + diff --git a/convert_to_markdown.sh b/convert_to_markdown.sh new file mode 100755 index 0000000..aabc9b8 --- /dev/null +++ b/convert_to_markdown.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Specify the input folder containing the .docx files +input_folder="./test" + +# Specify the media folder where media files will be extracted +media_folder="${input_folder}" + +# Loop through all .docx files in the input folder +for file in "$input_folder"/*.docx; do + # Check if the file is a regular file + if [ -f "$file" ]; then + # Extract the file name without extension + filename=$(basename -- "$file") + filename_no_ext="${filename%.*}" + + # Convert the .docx file to Markdown using Pandoc. See also https://stackoverflow.com/a/74654058/319711 for heading style + pandoc -s "$file" --wrap=none --reference-links --atx-headers --extract-media="$input_folder" -t markdown -o "${input_folder}/${filename_no_ext}.md" + + echo "Converted $file to Markdown." + fi +done + diff --git a/example.docx b/example.docx new file mode 100644 index 0000000..555bb28 Binary files /dev/null and b/example.docx differ diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..89442c3 --- /dev/null +++ b/src/main.rs @@ -0,0 +1,440 @@ +use std::collections::HashMap; + +use docx_rust::document::BodyContent::{Paragraph, Sdt, SectionProperty, Table, TableCell}; +use docx_rust::document::{ParagraphContent, RunContent}; +use docx_rust::formatting::ParagraphProperty; +use docx_rust::styles::StyleType; +use docx_rust::DocxFile; + +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct BlockStyle { + pub bold: bool, + pub italics: bool, + pub underline: bool, + pub strike: bool, + /// Size is specified in points x 2, so size 19 is equal to 9.5pt + pub size: Option, +} + +impl BlockStyle { + pub fn new() -> Self { + BlockStyle { + bold: false, + italics: false, + underline: false, + strike: false, + size: None, + } + } + + pub fn combine_with(&mut self, other: &BlockStyle) { + self.bold = other.bold; + self.italics = other.italics; + self.underline = other.underline; + self.strike = other.strike; + if let Some(size) = other.size { + self.size = Some(size); + } + } +} + +#[derive(Debug, Clone)] +pub struct ParagraphStyle { + pub style_id: Option, + pub outline_lvl: Option, + pub numbering: Option, + pub page_break_before: Option, + pub style: Option, +} + +impl Default for ParagraphStyle { + fn default() -> Self { + ParagraphStyle::new() + } +} + +impl ParagraphStyle { + pub fn new() -> Self { + ParagraphStyle { + style_id: None, + outline_lvl: None, + numbering: None, + page_break_before: None, + style: None, + } + } + + pub fn combine_with(&mut self, other: &ParagraphStyle) { + self.style_id = self.style_id.clone().or_else(|| other.style_id.clone()); + self.outline_lvl = self.outline_lvl.or_else(|| other.outline_lvl); + self.numbering = self.numbering.or_else(|| other.numbering); + self.page_break_before = self.page_break_before.or_else(|| other.page_break_before); + if let Some(ref mut style) = self.style { + if let Some(ref other_style) = other.style { + style.combine_with(other_style); + } + } else { + self.style = other.style.clone(); + } + } +} + +impl<'a> From<&'a ParagraphProperty<'a>> for ParagraphStyle { + fn from(paragraph_property: &'a ParagraphProperty) -> Self { + // Extract properties from ParagraphProperty and create a new ParagraphStyle + let mut paragraph_style = ParagraphStyle::new(); + if let Some(style_id) = ¶graph_property.style_id { + paragraph_style.style_id = Some(style_id.value.to_string()); + } + if let Some(outline_lvl) = ¶graph_property.outline_lvl { + paragraph_style.outline_lvl = Some(outline_lvl.value); + } + if let Some(page_break_before) = ¶graph_property.page_break_before { + paragraph_style.page_break_before = page_break_before.value; + } + if let Some(_) = paragraph_property.numbering { + paragraph_style.numbering = Some(true); + } + if let Some(character_property) = ¶graph_property.r_pr { + let mut block_style = BlockStyle::new(); + if let Some(size) = &character_property.size { + block_style.size = Some(size.value); + } + if character_property.bold.is_some() { + block_style.bold = true; + } + if character_property.underline.is_some() { + block_style.underline = true; + } + if character_property.italics.is_some() || character_property.emphasis.is_some() { + block_style.italics = true; + } + if character_property.strike.is_some() || character_property.dstrike.is_some() { + block_style.strike = true; + } + paragraph_style.style = Some(block_style); + } + paragraph_style + } +} + +#[derive(Debug, PartialEq, Eq)] +pub struct TextBlock { + pub style: Option, + pub text: String, +} + +impl TextBlock { + pub fn new(text: String, style: Option) -> Self { + TextBlock { style, text } + } + + pub fn to_markdown(&self, paragraph_style: &ParagraphStyle) -> String { + let mut markdown = self.text.clone(); + + let mut style = if self.style.is_some() { + self.style.as_ref().unwrap().clone() + } else { + BlockStyle::new() + }; + + if let Some(block_style) = ¶graph_style.style { + style.combine_with(block_style); + }; + + // Add bold formatting if enabled + if style.bold { + markdown = format!("**{markdown}**"); + } + + // Add italic formatting if enabled + if style.italics { + markdown = format!("*{markdown}*"); + } + + // Add underline formatting if enabled + if style.underline { + markdown = format!("__{markdown}__"); + } + + // Add strike-through formatting if enabled + if style.strike { + markdown = format!("~~{markdown}~~"); + } + markdown + } +} + +#[derive(Debug)] +pub struct MarkdownParagraph { + pub style: Option, + pub blocks: Vec, +} + +impl MarkdownParagraph { + pub fn new() -> Self { + MarkdownParagraph { + style: None, + blocks: vec![], + } + } + + pub fn to_markdown(&self, styles: &HashMap) -> String { + let mut markdown = String::new(); + + let mut style = if self.style.is_some() { + self.style.as_ref().unwrap().clone() + } else { + ParagraphStyle::default() + }; + + if let Some(style_id) = &style.style_id { + if let Some(doc_style) = styles.get(style_id) { + style.combine_with(doc_style); + } + // markdown += &format!("[{}]", style_id); + }; + + // Add outline level if available + if let Some(outline_lvl) = style.outline_lvl { + // Convert outline level to appropriate Markdown heading level + let heading_level = match outline_lvl { + 0 => "# ", + 1 => "## ", + 2 => "### ", + 3 => "#### ", + 4 => "##### ", + _ => "###### ", // Use the smallest heading level for higher levels + }; + markdown += heading_level; + } + + // Add numbering if available + if let Some(numbering) = style.numbering { + if numbering { + markdown += "1. "; // Start numbering from 1 + } + } + + for block in &self.blocks { + markdown += &block.to_markdown(&style); + } + markdown += "\n"; + + // Add page break before if available + // if let Some(page_break_before) = style.page_break_before { + // markdown += &format!("{{page_break_before: {}}}", page_break_before); + // } + + markdown + } +} + +#[derive(Debug)] +pub struct MarkdownDocument { + pub creator: Option, + pub last_editor: Option, + pub company: Option, + pub title: Option, + pub description: Option, + pub subject: Option, + pub keywords: Option, + pub paragraphs: Vec, + pub styles: HashMap, +} + +impl MarkdownDocument { + pub fn new() -> Self { + MarkdownDocument { + creator: None, + last_editor: None, + company: None, + title: None, + description: None, + subject: None, + keywords: None, + paragraphs: vec![], + styles: HashMap::new(), + } + } + + pub fn to_markdown(&self) -> String { + let mut markdown = String::new(); + + if let Some(title) = &self.title { + markdown += &format!("# {}\n\n", title); + } + + for paragraph in &self.paragraphs { + markdown += ¶graph.to_markdown(&self.styles); + markdown += "\n"; + } + + markdown + } +} + +fn main() { + let docx = match DocxFile::from_file("./test/headers.docx") { + Ok(docx_file) => docx_file, + Err(err) => { + panic!("Error processing file: {:?}", err) + } + }; + let docx = match docx.parse() { + Ok(docx) => docx, + Err(err) => { + panic!("Exiting: {:?}", err); + } + }; + + let mut markdown_doc = MarkdownDocument::new(); + + if let Some(app) = docx.app { + if let Some(company) = app.company { + if !company.is_empty() { + markdown_doc.company = Some(company.to_string()); + } + } + } + + if let Some(core) = docx.core { + if let Some(title) = core.title { + if !title.is_empty() { + markdown_doc.title = Some(title.to_string()); + } + } + if let Some(subject) = core.subject { + if !subject.is_empty() { + markdown_doc.subject = Some(subject.to_string()); + } + } + if let Some(keywords) = core.keywords { + if !keywords.is_empty() { + markdown_doc.keywords = Some(keywords.to_string()); + } + } + if let Some(description) = core.description { + if !description.is_empty() { + markdown_doc.description = Some(description.to_string()); + } + } + if let Some(creator) = core.creator { + if !creator.is_empty() { + markdown_doc.creator = Some(creator.to_string()); + } + } + if let Some(last_modified_by) = core.last_modified_by { + if !last_modified_by.is_empty() { + markdown_doc.last_editor = Some(last_modified_by.to_string()); + } + } + } + + for media in docx.media { + println!("Media: {media:?}",); + } + + for style in &docx.styles.styles { + match style.ty { + Some(StyleType::Paragraph) => { + if let Some(paragraph_property) = &style.paragraph { + let paragraph_style: ParagraphStyle = paragraph_property.into(); + markdown_doc + .styles + .insert(style.style_id.to_string(), paragraph_style); + } + } + _ => (), + } + } + + for content in docx.document.body.content { + match content { + Paragraph(paragraph) => { + let mut markdown_paragraph = MarkdownParagraph::new(); + if let Some(paragraph_property) = ¶graph.property { + let paragraph_style: ParagraphStyle = paragraph_property.into(); + markdown_paragraph.style = Some(paragraph_style); + } + for paragraph_content in paragraph.content { + match paragraph_content { + ParagraphContent::Run(run) => { + let block_style = match run.property { + Some(character_property) => { + let mut block_style = BlockStyle::new(); + if let Some(size) = character_property.size { + block_style.size = Some(size.value); + } + if character_property.bold.is_some() { + block_style.bold = true; + } + if character_property.underline.is_some() { + block_style.underline = true; + } + if character_property.italics.is_some() + || character_property.emphasis.is_some() + { + block_style.italics = true; + } + if character_property.strike.is_some() + || character_property.dstrike.is_some() + { + block_style.strike = true; + } + Some(block_style) + } + None => None, + }; + if let Some(text) = + run.content + .into_iter() + .find_map(|run_content| match run_content { + RunContent::Text(text) => Some(text.text.to_string()), + _ => None, + }) + { + let could_extend_text = if let Some(prev_block) = + markdown_paragraph.blocks.last_mut() + { + if prev_block.style == block_style { + prev_block.text.push_str(&text); + true + } else { + false + } + } else { + false + }; + if !could_extend_text { + let text_block = TextBlock::new(text, block_style); + markdown_paragraph.blocks.push(text_block); + } + }; + } + ParagraphContent::Link(link) => { + println!(" Link: {:?}", link); + } + _ => (), + } + } + if markdown_paragraph.blocks.len() > 0 { + markdown_doc.paragraphs.push(markdown_paragraph); + } + } + Table(table) => { + println!("Table: {:?}", table); + } + Sdt(_) => { + // println!("Sdt"); + } + SectionProperty(_sp) => { + // println!("SectionProperty: {:?}", sp); + } + TableCell(tc) => { + println!("TableCell: {:?}", tc); + } + } + } + println!("\n\n{}", markdown_doc.to_markdown()); +} diff --git a/test/block_quotes.docx b/test/block_quotes.docx new file mode 100644 index 0000000..e554b8a Binary files /dev/null and b/test/block_quotes.docx differ diff --git a/test/block_quotes.md b/test/block_quotes.md new file mode 100644 index 0000000..d43c8db --- /dev/null +++ b/test/block_quotes.md @@ -0,0 +1,15 @@ +## Some block quotes, in different ways + +This is the proper way, with a style + +> I don't know why this would be in italics, but so it appears to be on my screen. + +And this is also a proper way, with a different style + +> This is called the Intense Quote style. + +And this is the way that most people do it: + +> I just indented this, so it looks like a block quote. I think this is how most people do block quotes in their documents. + +And back to the normal style. diff --git a/test/codeblock.docx b/test/codeblock.docx new file mode 100644 index 0000000..cfadb7b Binary files /dev/null and b/test/codeblock.docx differ diff --git a/test/codeblock.md b/test/codeblock.md new file mode 100644 index 0000000..bf87d77 --- /dev/null +++ b/test/codeblock.md @@ -0,0 +1,7 @@ +This is some code: + + readDocx :: ReaderOptions + -> B.ByteString + -> Pandoc + +from the beginning of the docx reader. diff --git a/test/comments.docx b/test/comments.docx new file mode 100644 index 0000000..f04dc30 Binary files /dev/null and b/test/comments.docx differ diff --git a/test/comments.md b/test/comments.md new file mode 100644 index 0000000..58ed85f --- /dev/null +++ b/test/comments.md @@ -0,0 +1,7 @@ +I want some text to have a comment on it. + +This is a new paragraph. + +And so is this. + +One more. And this is one with a comment in a comment. diff --git a/test/custom_style_no_reference.docx b/test/custom_style_no_reference.docx new file mode 100644 index 0000000..8efe7d7 Binary files /dev/null and b/test/custom_style_no_reference.docx differ diff --git a/test/custom_style_no_reference.md b/test/custom_style_no_reference.md new file mode 100644 index 0000000..a7b19c0 --- /dev/null +++ b/test/custom_style_no_reference.md @@ -0,0 +1,7 @@ +This is a test of custom-styles. + +Here is something emphasized. And here is something strong. + +One paragraph of text. + +And another paragraph of really cool text. diff --git a/test/custom_style_preserve.docx b/test/custom_style_preserve.docx new file mode 100644 index 0000000..657195d Binary files /dev/null and b/test/custom_style_preserve.docx differ diff --git a/test/custom_style_preserve.md b/test/custom_style_preserve.md new file mode 100644 index 0000000..4f1e059 --- /dev/null +++ b/test/custom_style_preserve.md @@ -0,0 +1,25 @@ +This span[^1] should have a custom style ([link]), but the text after the comma shouldn't, nor should the link. + +The contents of this div should have a custom style, but [this link should not][link]. + +## This header should not have the div's custom style + +> This blockquote should not. + + # This code block should not. + +But this paragraph should.[^2] + +This should have MyInnerStyle. + +### This heading should not + +This should have MyOuterStyle, but the following elision should have its own style. \... + +> This blockquote should include **bold text with an elision: \...** + +[^1]: Neither footnote nor footnote reference should get a custom style from its span. + +[^2]: Neither footnote nor footnote reference should get a custom style from its div. + + [link]: http://example.com/ diff --git a/test/custom_style_reference.docx b/test/custom_style_reference.docx new file mode 100644 index 0000000..6037ef7 Binary files /dev/null and b/test/custom_style_reference.docx differ diff --git a/test/custom_style_reference.md b/test/custom_style_reference.md new file mode 100644 index 0000000..9a605f0 --- /dev/null +++ b/test/custom_style_reference.md @@ -0,0 +1,7 @@ +This is a test of custom-styles. + +Here is something *emphasized*. And here is something **strong**. + +> One paragraph of text. +> +> And another paragraph of *really cool* text. diff --git a/test/definition_list.docx b/test/definition_list.docx new file mode 100644 index 0000000..39ab03e Binary files /dev/null and b/test/definition_list.docx differ diff --git a/test/definition_list.md b/test/definition_list.md new file mode 100644 index 0000000..30e4fd2 --- /dev/null +++ b/test/definition_list.md @@ -0,0 +1,16 @@ +Term 1 + +: Definition 1 + +Term 2 with *inline markup* + +: Definition 2 + +```{=html} + +``` + { some code, part of Definition 2 } + +::: {.Definition} +Third paragraph of definition 2. +::: diff --git a/test/document-properties-short-desc.docx b/test/document-properties-short-desc.docx new file mode 100644 index 0000000..a937981 Binary files /dev/null and b/test/document-properties-short-desc.docx differ diff --git a/test/document-properties-short-desc.md b/test/document-properties-short-desc.md new file mode 100644 index 0000000..4db6b0a --- /dev/null +++ b/test/document-properties-short-desc.md @@ -0,0 +1,6 @@ +--- +author: A. M. +title: Testing custom properties +--- + +Testing document properties diff --git a/test/document-properties.docx b/test/document-properties.docx new file mode 100644 index 0000000..075c6d6 Binary files /dev/null and b/test/document-properties.docx differ diff --git a/test/document-properties.md b/test/document-properties.md new file mode 100644 index 0000000..1833636 --- /dev/null +++ b/test/document-properties.md @@ -0,0 +1,11 @@ +--- +author: A. M. +subtitle: This is a subtitle +title: Testing custom properties +--- + +Abstract + +Quite a long description spanning several lines + +Testing document properties diff --git a/test/headers.docx b/test/headers.docx new file mode 100644 index 0000000..c0e1c75 Binary files /dev/null and b/test/headers.docx differ diff --git a/test/headers.md b/test/headers.md new file mode 100644 index 0000000..17f5b05 --- /dev/null +++ b/test/headers.md @@ -0,0 +1,25 @@ +# A Test of Headers + +## Second Level + +Some plain text. + +### Third level + +Some more plain text. + +#### Fourth level + +Some more plain text. + +##### Fifth level + +Some more plain text. + +###### Sixth level + +Some more plain text. + +Seventh level + +Since no Heading 7 style exists in styles.xml, this gets converted to Span. diff --git a/test/image.docx b/test/image.docx new file mode 100644 index 0000000..92e5a43 Binary files /dev/null and b/test/image.docx differ diff --git a/test/image.md b/test/image.md new file mode 100644 index 0000000..e72df12 --- /dev/null +++ b/test/image.md @@ -0,0 +1,20 @@ +No width given: + +![testimg] + +With height 10cm: + +![2testimg] + +With width 6cm: + +![3testimg] + +# With height 3in and width 6in: + +![4testimg] + + [testimg]: ./test/media/rId20.jpg {width="2.0833333333333335in" height="2.0833333333333335in"} + [2testimg]: ./test/media/rId20.jpg {width="3.9370067804024496in" height="3.9370067804024496in"} + [3testimg]: ./test/media/rId20.jpg {width="2.3622036307961505in" height="2.3622036307961505in"} + [4testimg]: ./test/media/rId20.jpg {width="5.833333333333333in" height="2.9166666666666665in"} diff --git a/test/inline_code.docx b/test/inline_code.docx new file mode 100644 index 0000000..9329d98 Binary files /dev/null and b/test/inline_code.docx differ diff --git a/test/inline_code.md b/test/inline_code.md new file mode 100644 index 0000000..5b961d9 --- /dev/null +++ b/test/inline_code.md @@ -0,0 +1 @@ +This is an example of `inline code` with three spaces. diff --git a/test/inline_formatting.docx b/test/inline_formatting.docx new file mode 100644 index 0000000..44ded95 Binary files /dev/null and b/test/inline_formatting.docx differ diff --git a/test/inline_formatting.md b/test/inline_formatting.md new file mode 100644 index 0000000..76d867c --- /dev/null +++ b/test/inline_formatting.md @@ -0,0 +1,10 @@ +Regular text *italics* **bold *bold italics***. + +This is [Small Caps]{.smallcaps}, and this is ~~strikethrough~~. + +Some people use [single underlines for *emphasis*]{.underline}. + +Above the line is ^superscript^ and below the line is ~subscript~. + +A line\ +break. diff --git a/test/inline_images.docx b/test/inline_images.docx new file mode 100644 index 0000000..46f65f2 Binary files /dev/null and b/test/inline_images.docx differ diff --git a/test/inline_images.md b/test/inline_images.md new file mode 100644 index 0000000..b1494dd --- /dev/null +++ b/test/inline_images.md @@ -0,0 +1,7 @@ +This picture ![This one is green and looks like Sideshow Bob.] is an identicon. + +Here is [one ![This one is reddish, and looks like a heart that has leaked out.] that] links. + + [This one is green and looks like Sideshow Bob.]: ./test/media/rId20.jpg "First identicon" {width="0.8888888888888888in" height="0.8888888888888888in"} + [This one is reddish, and looks like a heart that has leaked out.]: ./test/media/rId20.jpg "Second identicon" {width="0.8888888888888888in" height="0.8888888888888888in"} + [one ![This one is reddish, and looks like a heart that has leaked out.] that]: http://www.google.com diff --git a/test/link_in_notes.docx b/test/link_in_notes.docx new file mode 100644 index 0000000..c08f3f4 Binary files /dev/null and b/test/link_in_notes.docx differ diff --git a/test/link_in_notes.md b/test/link_in_notes.md new file mode 100644 index 0000000..38a7527 --- /dev/null +++ b/test/link_in_notes.md @@ -0,0 +1,3 @@ +This is a test[^1]. + +[^1]: diff --git a/test/links.docx b/test/links.docx new file mode 100644 index 0000000..52de02a Binary files /dev/null and b/test/links.docx differ diff --git a/test/links.md b/test/links.md new file mode 100644 index 0000000..619812e --- /dev/null +++ b/test/links.md @@ -0,0 +1,18 @@ +## An internal link and an external link + +An [external link] to a popular website. + +An [external link][1] to a website with an anchor. + +An [internal link] to a section header. + +An [internal link][2] to a bookmark. + +## A section for testing link targets + +A bookmark right []{#my_bookmark .anchor}here + + [external link]: http://google.com + [1]: http://pandoc.org/README.html#synopsis + [internal link]: #a-section-for-testing-link-targets + [2]: #my_bookmark diff --git a/test/lists.docx b/test/lists.docx new file mode 100644 index 0000000..ed5c167 Binary files /dev/null and b/test/lists.docx differ diff --git a/test/lists.md b/test/lists.md new file mode 100644 index 0000000..196b31a --- /dev/null +++ b/test/lists.md @@ -0,0 +1,29 @@ +## Some nested lists + +1. one + +2. two + + a. a + + b. b + +- one + +- two + + - three + + - four + + ```{=html} + + ``` + - Sub paragraph + +- Same list + +```{=html} + +``` +- Different list adjacent to the one above. diff --git a/test/lists_continuing.docx b/test/lists_continuing.docx new file mode 100644 index 0000000..770d726 Binary files /dev/null and b/test/lists_continuing.docx differ diff --git a/test/lists_continuing.md b/test/lists_continuing.md new file mode 100644 index 0000000..a87932b --- /dev/null +++ b/test/lists_continuing.md @@ -0,0 +1,9 @@ +1. Foo + +2. Bar + +3. Baz + +Interruption. + +4. Bop diff --git a/test/lists_div_bullets.docx b/test/lists_div_bullets.docx new file mode 100644 index 0000000..c330757 Binary files /dev/null and b/test/lists_div_bullets.docx differ diff --git a/test/lists_div_bullets.md b/test/lists_div_bullets.md new file mode 100644 index 0000000..7d1fe57 --- /dev/null +++ b/test/lists_div_bullets.md @@ -0,0 +1,16 @@ +- one + +```{=html} + +``` +- two + +```{=html} + +``` +- # three + +```{=html} + +``` +- four diff --git a/test/lists_multiple_initial.docx b/test/lists_multiple_initial.docx new file mode 100644 index 0000000..3c7bcc8 Binary files /dev/null and b/test/lists_multiple_initial.docx differ diff --git a/test/lists_multiple_initial.md b/test/lists_multiple_initial.md new file mode 100644 index 0000000..7fea7e2 --- /dev/null +++ b/test/lists_multiple_initial.md @@ -0,0 +1,7 @@ +1. a. foo + + b. bar + +- - foo + + - bar diff --git a/test/lists_restarting.docx b/test/lists_restarting.docx new file mode 100644 index 0000000..ff8d041 Binary files /dev/null and b/test/lists_restarting.docx differ diff --git a/test/lists_restarting.md b/test/lists_restarting.md new file mode 100644 index 0000000..a017bbd --- /dev/null +++ b/test/lists_restarting.md @@ -0,0 +1,9 @@ +2. Foo + +3. Bar + +4. Baz + +> Interruption + +1. Bop. diff --git a/test/media/rId20.jpg b/test/media/rId20.jpg new file mode 100644 index 0000000..5a50fc0 Binary files /dev/null and b/test/media/rId20.jpg differ diff --git a/test/nested_anchors_in_header.docx b/test/nested_anchors_in_header.docx new file mode 100644 index 0000000..61d9222 Binary files /dev/null and b/test/nested_anchors_in_header.docx differ diff --git a/test/nested_anchors_in_header.md b/test/nested_anchors_in_header.md new file mode 100644 index 0000000..1152b05 --- /dev/null +++ b/test/nested_anchors_in_header.md @@ -0,0 +1,34 @@ +# Оглавление + +[Short instructions] + +[Some instructions] + +[Remote folder or longlonglonglonglong file with manymanymanymany letters inside opening] + +[Remote folder or longlonglonglonglong file with manymanymanymany letters inside closing] + +# Short instructions + +[Open remote folder][Remote folder or longlonglonglonglong file with manymanymanymany letters inside opening] + +Do staff + +[Close remote folder][Remote folder or longlonglonglonglong file with manymanymanymany letters inside closing] + +# Some instructions + +Lines + +## Remote folder or longlonglonglonglong file with manymanymanymany letters inside opening + +Open folder + +## Remote folder or longlonglonglonglong file with manymanymanymany letters inside closing + +Close folder + + [Short instructions]: #short-instructions + [Some instructions]: #some-instructions + [Remote folder or longlonglonglonglong file with manymanymanymany letters inside opening]: #X49da2d776f7a640cd76098979e5788f8119bc44 + [Remote folder or longlonglonglonglong file with manymanymanymany letters inside closing]: #Xb95b585046f38c7739779215f99b6b21152b861 diff --git a/test/notes.docx b/test/notes.docx new file mode 100644 index 0000000..c6cb5cc Binary files /dev/null and b/test/notes.docx differ diff --git a/test/notes.md b/test/notes.md new file mode 100644 index 0000000..c2a421e --- /dev/null +++ b/test/notes.md @@ -0,0 +1,7 @@ +## A footnote + +Test footnote.[^1] Test endnote.[^2] + +[^1]: My note. + +[^2]: This is an endnote at the end of the document. diff --git a/test/raw-blocks.docx b/test/raw-blocks.docx new file mode 100644 index 0000000..020b6b0 Binary files /dev/null and b/test/raw-blocks.docx differ diff --git a/test/raw-blocks.md b/test/raw-blocks.md new file mode 100644 index 0000000..b94bf54 --- /dev/null +++ b/test/raw-blocks.md @@ -0,0 +1,5 @@ +Cell compartments + + ---------- ---------- + Ribosome Lysosome + ---------- ---------- diff --git a/test/raw-bookmarks.docx b/test/raw-bookmarks.docx new file mode 100644 index 0000000..118dcc0 Binary files /dev/null and b/test/raw-bookmarks.docx differ diff --git a/test/raw-bookmarks.md b/test/raw-bookmarks.md new file mode 100644 index 0000000..6e60e40 --- /dev/null +++ b/test/raw-bookmarks.md @@ -0,0 +1,5 @@ +Manual endnotes. + +Nullam eu ante vel est convallis dignissim. Nunc porta vulputate tellus. Nunc rutrum turpis sed pede. Sed bibendum.Aliquam posuere. + +Nunc aliquet, augue nec adipiscing interdum, lacus tellus malesuada massa, quis varius mi purus non odio.Pellentesque condimentum, magna ut suscipit hendrerit, ipsum augue ornare nulla, non luctus diam neque sit amet urna. Curabitur vulputate vestibulum lorem. diff --git a/test/table_one_row.docx b/test/table_one_row.docx new file mode 100644 index 0000000..4ea9d8d Binary files /dev/null and b/test/table_one_row.docx differ diff --git a/test/table_one_row.md b/test/table_one_row.md new file mode 100644 index 0000000..36a7991 --- /dev/null +++ b/test/table_one_row.md @@ -0,0 +1,3 @@ + ----- ----- ------- + One Row Table + ----- ----- ------- diff --git a/test/table_with_list_cell.docx b/test/table_with_list_cell.docx new file mode 100644 index 0000000..75851eb Binary files /dev/null and b/test/table_with_list_cell.docx differ diff --git a/test/table_with_list_cell.md b/test/table_with_list_cell.md new file mode 100644 index 0000000..d0b80e9 --- /dev/null +++ b/test/table_with_list_cell.md @@ -0,0 +1,9 @@ ++-----------------+--------------------+ +| Cell with text | Cell with text | ++=================+====================+ +| - Cell with | 1. Cell with | +| | | +| - A | 2. A | +| | | +| - Bullet list | 3. Numbered list. | ++-----------------+--------------------+ diff --git a/test/tables-default-widths.docx b/test/tables-default-widths.docx new file mode 100644 index 0000000..d6bf5cc Binary files /dev/null and b/test/tables-default-widths.docx differ diff --git a/test/tables-default-widths.md b/test/tables-default-widths.md new file mode 100644 index 0000000..46a57d0 --- /dev/null +++ b/test/tables-default-widths.md @@ -0,0 +1,22 @@ +## A table, with and without a header row + + Name Game Fame Blame + ---------------- ------------ ----------- ------------------- + Lebron James Basketball Very High Leaving Cleveland + Ryan Braun Baseball Moderate Steroids + Russell Wilson Football High Tacky uniform + + --------- -------- + Sinple Table + Without Header + --------- -------- + ++----------------+---------+ +| Simple | Table | +| | | +| Multiparagraph | Full | ++----------------+---------+ +| Of | In each | +| | | +| Paragraphs | Cell. | ++----------------+---------+ diff --git a/test/tables.docx b/test/tables.docx new file mode 100644 index 0000000..78abdcc Binary files /dev/null and b/test/tables.docx differ diff --git a/test/tables.md b/test/tables.md new file mode 100644 index 0000000..46a57d0 --- /dev/null +++ b/test/tables.md @@ -0,0 +1,22 @@ +## A table, with and without a header row + + Name Game Fame Blame + ---------------- ------------ ----------- ------------------- + Lebron James Basketball Very High Leaving Cleveland + Ryan Braun Baseball Moderate Steroids + Russell Wilson Football High Tacky uniform + + --------- -------- + Sinple Table + Without Header + --------- -------- + ++----------------+---------+ +| Simple | Table | +| | | +| Multiparagraph | Full | ++----------------+---------+ +| Of | In each | +| | | +| Paragraphs | Cell. | ++----------------+---------+ diff --git a/test/tables_separated_with_rawblock.docx b/test/tables_separated_with_rawblock.docx new file mode 100644 index 0000000..4cf5518 Binary files /dev/null and b/test/tables_separated_with_rawblock.docx differ diff --git a/test/tables_separated_with_rawblock.md b/test/tables_separated_with_rawblock.md new file mode 100644 index 0000000..ef94ec0 --- /dev/null +++ b/test/tables_separated_with_rawblock.md @@ -0,0 +1,7 @@ + --- --- + a b + --- --- + + --- --- + c d + --- --- diff --git a/test/track_changes_deletion.docx b/test/track_changes_deletion.docx new file mode 100644 index 0000000..dafce17 Binary files /dev/null and b/test/track_changes_deletion.docx differ diff --git a/test/track_changes_deletion.md b/test/track_changes_deletion.md new file mode 100644 index 0000000..a0b2f5c --- /dev/null +++ b/test/track_changes_deletion.md @@ -0,0 +1 @@ +This is a text with a deletion. diff --git a/test/track_changes_insertion.docx b/test/track_changes_insertion.docx new file mode 100644 index 0000000..bf1f5eb Binary files /dev/null and b/test/track_changes_insertion.docx differ diff --git a/test/track_changes_insertion.md b/test/track_changes_insertion.md new file mode 100644 index 0000000..2830e0e --- /dev/null +++ b/test/track_changes_insertion.md @@ -0,0 +1 @@ +This is a text with two exciting insertions. diff --git a/test/track_changes_move.docx b/test/track_changes_move.docx new file mode 100644 index 0000000..cf5174d Binary files /dev/null and b/test/track_changes_move.docx differ diff --git a/test/track_changes_move.md b/test/track_changes_move.md new file mode 100644 index 0000000..8f6f350 --- /dev/null +++ b/test/track_changes_move.md @@ -0,0 +1,5 @@ +Here is some text. + +Here is the text to be moved. + +Here is some more text. diff --git a/test/track_changes_scrubbed_metadata.docx b/test/track_changes_scrubbed_metadata.docx new file mode 100644 index 0000000..e8b2c52 Binary files /dev/null and b/test/track_changes_scrubbed_metadata.docx differ diff --git a/test/track_changes_scrubbed_metadata.md b/test/track_changes_scrubbed_metadata.md new file mode 100644 index 0000000..7054544 --- /dev/null +++ b/test/track_changes_scrubbed_metadata.md @@ -0,0 +1 @@ +Here is a document. diff --git a/test/unicode.docx b/test/unicode.docx new file mode 100644 index 0000000..dd8142e Binary files /dev/null and b/test/unicode.docx differ diff --git a/test/unicode.md b/test/unicode.md new file mode 100644 index 0000000..1e98c7a --- /dev/null +++ b/test/unicode.md @@ -0,0 +1 @@ +Hello, 世界. This costs €10.∨∨( diff --git a/test/verbatim_subsuper.docx b/test/verbatim_subsuper.docx new file mode 100644 index 0000000..e7f6ea9 Binary files /dev/null and b/test/verbatim_subsuper.docx differ diff --git a/test/verbatim_subsuper.md b/test/verbatim_subsuper.md new file mode 100644 index 0000000..92b34ca --- /dev/null +++ b/test/verbatim_subsuper.md @@ -0,0 +1,15 @@ +m^2^ + +m^`2`^ + +`m`^2^ + +`m`^`2`^ + +m~2~ + +m~`2`~ + +`m`~2~ + +`m`~`2`~