Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Notebook to markdown GitHub workflow #518

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions .github/scripts/post_process_notebook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python

import os
import re
import argparse


def add_import_statement():
    """Return the import line for the CTAButtons component.

    The returned text ends with a blank line so that whatever is written
    after it starts on a fresh paragraph.
    """
    cta_import = "import { CTAButtons } from '@site/src/components/CTAButtons/CTAButtons.tsx'\n\n"
    return cta_import

def extract_href_links_from_markdown(markdown_text):
    """Collect the href values of anchor tags found in *markdown_text*.

    Only anchors whose ``href`` attribute directly follows ``<a`` (separated
    by whitespace) are matched. Values are returned in document order; an
    empty list is returned when nothing matches.
    """
    anchor_href = r'<a\s+href="([^"]+)"'
    return [found.group(1) for found in re.finditer(anchor_href, markdown_text)]

def format_CTA_button(href_links):
    """Build the CTAButtons markup for the first Colab link in *href_links*.

    Args:
        href_links: URLs extracted from the markdown, in document order.

    Returns:
        ``"<CTAButtons colab_button='<url>'/>"`` for the first URL that
        contains ``"colab"``, or an empty string when no such URL exists.
    """
    # Select the Colab URL itself; the first link in the document is not
    # necessarily the Colab one.
    colab_links = [link for link in href_links if "colab" in link]
    if not colab_links:
        return ''
    # Only the first Colab link is used when several are present.
    return "<CTAButtons colab_button='" + colab_links[0] + "'/>"

def remove_patterns_from_markdown(markdown_text):
    """Strip unwanted HTML fragments from *markdown_text* and return the rest.

    Removed, in this order: ``<img>`` tags, ``<div>...</div>`` elements,
    ``<!--- @wandbcode{...} -->`` comments, and ``<a ... href="...">...</a>``
    anchors (including their link text). No regex flags are used, so ``.``
    does not cross newlines: a div or anchor whose content spans several
    lines is left untouched.
    """
    unwanted_patterns = (
        r'<img[^>]+>',
        r'<div\b[^>]*>.*?</div>',
        r'<!---\s*@wandbcode\{.*?\}\s*-->',
        r'<a\s+[^>]*\s*href\s*=\s*"[^"]*"\s*[^>]*>.*?</a>',
    )

    # Apply the substitutions one after another; later patterns see the
    # output of earlier ones.
    result = markdown_text
    for pattern in unwanted_patterns:
        result = re.sub(pattern, '', result)
    return result


def main(args):
    """Post-process every markdown file listed in ``args.colab_notebooks``.

    Each file is read, its anchor hrefs are extracted to build the CTA
    button, unwanted HTML fragments are stripped, and the file is then
    overwritten in place with: the CTAButtons import line, the button
    markup, and the cleaned markdown. The path of each file is printed
    before it is processed.

    Args:
        args: Parsed command-line arguments exposing ``colab_notebooks``,
            an iterable of markdown file paths.
    """
    for colab in args.colab_notebooks:
        print(colab)

        # Read with an explicit encoding so the result does not depend on
        # the locale of the machine running the script.
        with open(colab, 'r', encoding='utf-8') as file:
            markdown_text = file.read()

        # The button is built from the links in the *original* text, because
        # the cleaning step below removes the anchors.
        href_links = extract_href_links_from_markdown(markdown_text)
        colab_button_markdown = format_CTA_button(href_links)

        cleaned_markdown = remove_patterns_from_markdown(markdown_text)

        # Overwrite the input file in place.
        with open(colab, 'w', encoding='utf-8') as file:
            file.write(add_import_statement())
            file.write(colab_button_markdown)
            # file.write(add_title(title))  # To do
            file.write(cleaned_markdown)

if __name__ == "__main__":
    # Command-line entry point: zero or more markdown paths are accepted
    # and handed to main() as ``colab_notebooks``.
    cli = argparse.ArgumentParser()
    cli.add_argument("colab_notebooks", nargs="*", help="markdown file to process")
    main(cli.parse_args())
53 changes: 53 additions & 0 deletions .github/scripts/rename_notebook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
#!/usr/bin/env python

import os
import argparse

# no_longer = {
# "RayTune_with_wandb": "",
# "Weights_&_Biases_with_fastai": "",
# "WandB_Prompts_Quickstart":"",
# }

# Lookup table used by rename_markdown_file().
# Keys are compared against the base name of an input file (the text before
# the first "." in its file name); values are the new base name, to which
# ".md" is appended when the file is renamed.
title_mapping = {
"Intro_to_Weights_&_Biases": "experiments",
"Pipeline_Versioning_with_W&B_Artifacts": "artifacts",
"Model_Registry_E2E": "models",
"W&B_Tables_Quickstart": "tables",
"Organizing_Hyperparameter_Sweeps_in_PyTorch_with_W&B": "sweeps",
"Using_W&B_Sweeps_with_XGBoost": "xgboost_sweeps",
"Simple_PyTorch_Integration": "pytorch",
"Huggingface_wandb": "huggingface",
"Hyperparameter_Optimization_in_TensorFlow_using_W&B_Sweeps": "tensorflow_sweeps",
"Image_Classification_using_PyTorch_Lightning": "lightning",
"Simple_TensorFlow_Integration": "tensorflow",
"Use_WandbMetricLogger_in_your_Keras_workflow": "keras",
"Use_WandbEvalCallback_in_your_Keras_workflow": "keras_table",
"Use_WandbModelCheckpoint_in_your_Keras_workflow": "keras_models",
}

def rename_markdown_file(filename, title_names):
    """Rename *filename* when its base name has an entry in *title_names*.

    The lookup key is the part of the file name before its first ".".
    On a match the file is renamed to ``<mapped name>.md``; the new name
    carries no directory component, so it is resolved against the current
    working directory. Without a match the file is left alone. A message
    describing the outcome is printed in both cases.

    Args:
        filename: Path of the file to inspect.
        title_names: Mapping from base names to replacement base names.
    """
    stem = os.path.basename(filename).partition('.')[0]

    # Guard clause: nothing to do for files we have no mapping for.
    if stem not in title_names:
        print(f"No title match found. {filename} reserved.")
        return

    target = title_names[stem]
    print(f"Renaming notebook from {filename} to {target}.md")
    os.rename(filename, target + ".md")


def main(args):
    """Run the rename check on every path in ``args.file``.

    The full list of paths is printed first, then each path is passed to
    rename_markdown_file() together with the module-level title_mapping.
    """
    print(args.file)
    for candidate in args.file:
        rename_markdown_file(candidate, title_mapping)

if __name__ == "__main__":
    # Command-line entry point: zero or more paths are accepted and handed
    # to main() as ``file``.
    cli = argparse.ArgumentParser()
    cli.add_argument("file", nargs="*", help="Notebook to check if it needs converting")
    main(cli.parse_args())
79 changes: 79 additions & 0 deletions .github/workflows/create_markdown.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Converts notebooks changed in a pull request to markdown, then post-processes
# and renames the generated markdown files.
name: Convert Jupyter notebooks to markdown files
on:
  pull_request:
    types: [opened]

jobs:
  convert_to_markdown:
    name: Convert Jupyter Notebooks to Markdown
    runs-on: ubuntu-latest
    outputs:
      generated_markdown_files: ${{ steps.convert_notebooks.outputs.generated_markdown_files }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          # The diff below compares HEAD with its parent, so the parent
          # commit has to be fetched as well (the default depth is 1).
          fetch-depth: 2

      - name: Install Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      # NOTE(review): this installs from a requirements.txt at the repository
      # root, while this change adds .github/workflows/requirements.txt.
      # Confirm which file is intended and that it provides `jupyter nbconvert`.
      - name: Install Python dependencies
        run: pip install -r requirements.txt

      - name: Find Modified Jupyter Notebooks
        id: find_notebooks
        run: |
          # Get notebooks that were modified
          NOTEBOOK_FILES=$(git diff --name-only --diff-filter=AMR HEAD^ HEAD | grep -E "\.ipynb$" | tr '\n' ' ')
          # Pass the list to the next step
          echo "NOTEBOOK_FILES=$NOTEBOOK_FILES" >> "$GITHUB_ENV"

      - name: Convert Jupyter Notebooks to Markdown
        # Step conditions are expressions: environment variables are read
        # through the env context, not with shell-style $NAME.
        if: env.NOTEBOOK_FILES != ''
        id: convert_notebooks
        run: |
          GENERATED_MARKDOWN_FILES=""
          for notebook_file in $NOTEBOOK_FILES; do
            # Test the command directly so the error message is printed even
            # when the shell is running with errexit.
            if ! jupyter nbconvert --to markdown "$notebook_file"; then
              echo "Error: Conversion of $notebook_file to Markdown failed."
              exit 1
            fi
            # Record the markdown path: same name with the extension swapped.
            # NOTE(review): assumes nbconvert writes next to the notebook - confirm.
            GENERATED_MARKDOWN_FILES="$GENERATED_MARKDOWN_FILES${notebook_file%.ipynb}.md "
          done
          # Pass the list to the next job; the key matches the job output above.
          echo "generated_markdown_files=$GENERATED_MARKDOWN_FILES" >> "$GITHUB_OUTPUT"

  post_process_markdown:
    name: Post-process Markdown Files
    needs: convert_to_markdown
    runs-on: ubuntu-latest
    env:
      GENERATED_MARKDOWN_FILES: ${{ needs.convert_to_markdown.outputs.generated_markdown_files }}

    # NOTE(review): this job starts from its own checkout, so the markdown
    # files generated by convert_to_markdown are presumably not present here.
    # Transfer them (e.g. as an artifact) or merge the two jobs before relying
    # on the steps below.
    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Install Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10'

      - name: Post-process Markdown Files
        id: post_process_markdown
        if: env.GENERATED_MARKDOWN_FILES != ''
        run: |
          # The script rewrites each file in place; its stdout is only a
          # progress log, so the same file list is forwarded to the next step.
          python .github/scripts/post_process_notebook.py $GENERATED_MARKDOWN_FILES
          echo "PROCESSED_MARKDOWN=$GENERATED_MARKDOWN_FILES" >> "$GITHUB_ENV"

      - name: Rename markdown Files
        if: env.PROCESSED_MARKDOWN != ''
        run: python .github/scripts/rename_notebook.py $PROCESSED_MARKDOWN
1 change: 1 addition & 0 deletions .github/workflows/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
nbdev
Loading