Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,7 @@ used to overwrite the defaults.
- `RF_PID_FILE` - File to store process ids of started services (default: ${RF_HOME}/rapidfire_pids.txt)
- `RF_PYTHON_EXECUTABLE` - Python executable (default: python3 falls back to python if not found)
- `RF_PIP_EXECUTABLE` - pip executable (default: pip3 falls back to pip if not found)
- `RF_CONVERGE_MODE` - Which RapidFire AI Converge components to start, if available: `all`, `none`, `backend`, or `frontend` (default: all)

## Community & Governance

Expand Down
11 changes: 11 additions & 0 deletions rapidfireai/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from .version import __version__

RF_CONVERGE_MODE = os.getenv("RF_CONVERGE_MODE", "all")

def get_script_path():
"""Get the path to the start.sh script.
Expand Down Expand Up @@ -456,6 +457,13 @@ def main():

parser.add_argument("--log-lines", type=int, default=10, help="Number of lines to log to the console")

parser.add_argument(
"--converge",
choices=["all", "none", "backend", "frontend"],
default=RF_CONVERGE_MODE,
help="Converge mode: all (default, start converge backend+frontend), none (use original frontend, do not start converge), backend (only converge backend), frontend (only converge frontend)",
)

args = parser.parse_args()

# Set environment variables from CLI args
Expand All @@ -481,6 +489,9 @@ def main():
if args.force:
os.environ["RF_FORCE"] = "true"

# Converge mode (all|none|backend|frontend) for start script
os.environ["RF_CONVERGE_MODE"] = args.converge

# Handle doctor command separately
if args.command == "doctor":
return run_doctor(args.log_lines)
Expand Down
3 changes: 3 additions & 0 deletions rapidfireai/utils/doctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def get_doctor_info(log_lines: int = 10):
"mlflow",
"torch",
"transformers",
"protobuf",
"flask",
"gunicorn",
"peft",
Expand All @@ -78,6 +79,8 @@ def get_doctor_info(log_lines: int = 10):
"langchain-openai",
"langchain-huggingface",
"langchain-classic",
"langchain-pinecone",
"langchain-postgres",
"unstructured",
"waitress",
"vllm",
Expand Down
4 changes: 2 additions & 2 deletions setup/evals/requirements-colab.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,5 @@ flask-cors>=5.0.1
# Logging
loguru


numpy==2.0.1
numpy==2.0.1
protobuf<6.0.0
4 changes: 3 additions & 1 deletion setup/evals/requirements-local.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ langchain-postgres>=0.0.17

# Data Manipulation & Display
unstructured>=0.18.15
numpy>=1.26.4,<2.3

# Other
requests==2.32.5
Expand All @@ -41,3 +40,6 @@ mlflow>=3.2.0
gunicorn>=23.0.0
flask-cors>=5.0.1
loguru

numpy==2.0.1
protobuf<6.0.0
133 changes: 127 additions & 6 deletions setup/start.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# This script starts MLflow server, API server, and frontend tracking server
# Used for pip-installed package mode


set -e # Exit on any error

# Configuration
Expand All @@ -29,6 +30,20 @@ RF_LOG_PATH="${RF_LOG_PATH:=$RF_HOME/logs}"

RF_TIMEOUT_TIME=${RF_TIMEOUT_TIME:=30}

# Converge mode: all (backend+frontend), none (original frontend only), backend, frontend
RF_CONVERGE_MODE=${RF_CONVERGE_MODE:=all}
# Reject anything outside the four supported modes before any service starts.
if [[ "$RF_CONVERGE_MODE" != "all" && "$RF_CONVERGE_MODE" != "none" \
   && "$RF_CONVERGE_MODE" != "backend" && "$RF_CONVERGE_MODE" != "frontend" ]]; then
    echo "Invalid RF_CONVERGE_MODE=$RF_CONVERGE_MODE (expected: all, none, backend, frontend)"
    exit 1
fi
# Converge backend binds its own host/port; the Converge frontend reuses the
# regular frontend host/port unless explicitly overridden.
RF_CONVERGE_BACKEND_HOST=${RF_CONVERGE_BACKEND_HOST:=0.0.0.0}
RF_CONVERGE_BACKEND_PORT=${RF_CONVERGE_BACKEND_PORT:=8860}
RF_CONVERGE_FRONTEND_HOST=${RF_CONVERGE_FRONTEND_HOST:=$RF_FRONTEND_HOST}
RF_CONVERGE_FRONTEND_PORT=${RF_CONVERGE_FRONTEND_PORT:=$RF_FRONTEND_PORT}

# Colab mode configuration
if [ -z "${COLAB_GPU+x}" ]; then
RF_MLFLOW_ENABLED=${RF_MLFLOW_ENABLED:=true}
Expand Down Expand Up @@ -96,6 +111,11 @@ print_warning() {
echo -e "${YELLOW}[$(date '+%Y-%m-%d %H:%M:%S')]${NC} $1"
}

# Check whether the rapidfireai-pro pip package is installed in the current
# environment. Exit status 0 means installed, non-zero means absent.
has_rapidfireai_pro() {
    ${RF_PIP_EXECUTABLE} show rapidfireai-pro > /dev/null 2>&1
    return $?
}

# Function to setup Python environment
setup_python_env() {
print_status "Setting up Python environment..."
Expand Down Expand Up @@ -182,6 +202,9 @@ cleanup() {
pkill -f "gunicorn.*rapidfireai.$RAPIDFIRE_MODE.dispatcher" 2>/dev/null || true
# Only kill Flask server if we're not in Colab (frontend doesn't run in Colab)
pkill -f "python.*rapidfireai/frontend/server.py" 2>/dev/null || true
# Stop Converge if it was running
pkill -f "converge start" 2>/dev/null || true
pkill -f "uvicorn.*main:app" 2>/dev/null || true
fi

print_success "All services stopped"
Expand Down Expand Up @@ -575,6 +598,81 @@ start_frontend() {
return 0
}

# Dump diagnostic logs after a Converge startup failure: the shared
# converge.log plus an optional per-component log.
# $1: per-component log filename (e.g. converge_backend.log)
dump_converge_failure_logs() {
    local component_log="$1"
    if [[ -f "$RF_LOG_PATH/converge.log" ]]; then
        echo "=== Last 30 lines of converge.log ==="
        tail -30 "$RF_LOG_PATH/converge.log"
        echo "=== End of log ==="
        if [[ -f "$RF_LOG_PATH/$component_log" ]]; then
            echo "=== Last 30 lines of $component_log ==="
            tail -30 "$RF_LOG_PATH/$component_log"
            echo "=== End of log ==="
        else
            echo "No $component_log file found"
        fi
    else
        echo "No converge.log file found"
    fi
}

# Function to start Converge via converge CLI (mode: all | backend | frontend)
# Launches `converge start` in the background, records its PID, and waits for
# the ports implied by the mode. Returns 1 (after dumping logs) on failure.
start_converge() {
    local mode="${1:-$RF_CONVERGE_MODE}"
    print_status "Starting Converge ($mode)..."

    # converge start runs in the foreground with its own monitor loop,
    # so we launch it in the background and track it like other services.
    print_status "Converge logs will be written to: $RF_LOG_PATH/converge.log"

    # "all" starts both components and takes no positional argument;
    # "backend"/"frontend" restrict converge to that single component.
    local converge_args="--force"
    case "$mode" in
        all) ;;
        backend) converge_args="$converge_args backend" ;;
        frontend) converge_args="$converge_args frontend" ;;
        *) ;;
    esac

    # Prefer setsid so the whole converge process group can be managed
    # independently of this shell; fall back to nohup where unavailable.
    if command -v setsid &> /dev/null; then
        setsid converge start $converge_args > "$RF_LOG_PATH/converge.log" 2>&1 &
    else
        nohup converge start $converge_args > "$RF_LOG_PATH/converge.log" 2>&1 &
    fi

    local converge_pid=$!
    echo "$converge_pid Converge" >> "$RF_PID_FILE"

    # The backend serves its port in "backend" and "all" modes; wait for it there.
    if [[ "$mode" == "backend" ]] || [[ "$mode" == "all" ]]; then
        if wait_for_service $RF_CONVERGE_BACKEND_HOST $RF_CONVERGE_BACKEND_PORT "Converge backend" $RF_TIMEOUT_TIME; then
            print_success "Converge backend started (PID: $converge_pid)"
        else
            print_error "Converge backend failed to start. Checking logs..."
            dump_converge_failure_logs "converge_backend.log"
            return 1
        fi
    fi

    # The frontend serves its port in "frontend" and "all" modes.
    if [[ "$mode" == "frontend" ]] || [[ "$mode" == "all" ]]; then
        if wait_for_service $RF_CONVERGE_FRONTEND_HOST $RF_CONVERGE_FRONTEND_PORT "Converge frontend" $RF_TIMEOUT_TIME; then
            print_success "Converge frontend started (PID: $converge_pid)"
        else
            print_error "Converge frontend failed to start. Checking logs..."
            dump_converge_failure_logs "converge_frontend.log"
            return 1
        fi
    fi
    return 0
}

# Function to conditionally start frontend based on mode
start_frontend_if_needed() {
# In Colab mode, always skip frontend
Expand Down Expand Up @@ -727,11 +825,34 @@ start_services() {

# Start frontend server (conditionally)
if [[ "$RF_MLFLOW_ENABLED" == "true" ]]; then
if start_frontend; then
((services_started++))
else
print_error "Failed to start frontend server"
fi
case "$RF_CONVERGE_MODE" in
none)
if start_frontend; then
((services_started++))
else
print_error "Failed to start frontend server"
fi
;;
backend|frontend|all)
if has_rapidfireai_pro; then
if start_converge; then
((services_started++))
else
print_error "Failed to start Converge"
fi
else
if [[ "$RF_CONVERGE_MODE" == "all" ]]; then
if start_frontend; then
((services_started++))
else
print_error "Failed to start frontend server"
fi
else
print_error "rapidfireai-pro is not installed (required for --converge=$RF_CONVERGE_MODE)"
fi
fi
;;
esac
else
print_status "⊗ Skipping frontend (use TensorBoard if in Colab mode)"
fi
Expand Down Expand Up @@ -794,7 +915,7 @@ main() {

# Show summary of all log files for debugging
print_status "=== Startup Failure Summary ==="
for log_file in "mlflow.log" "api.log" "frontend.log"; do
for log_file in "mlflow.log" "api.log" "frontend.log" "converge.log"; do
if [[ -f "$RF_LOG_PATH/$log_file" ]]; then
echo ""
print_status "=== $log_file ==="
Expand Down
Loading