Skip to content

Commit

Permalink
Update logic to skip non-readable repo file
Browse files (browse the repository at this point in the history)
  • Loading branch information
Jet Xu committed Dec 7, 2024
1 parent 9d9cfa4 commit e242564
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 56 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -160,4 +160,6 @@ cython_debug/
#.idea/

.DS_Store
tests/self/
tests/self/
tests/self/*
tests/self/*/*
127 changes: 75 additions & 52 deletions llama_github/data_retrieval/github_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,84 +161,107 @@ def get_file_content(self, file_path: str, sha: Optional[str] = None) -> Optiona

# Skip files that don't need processing
if any(file_path.endswith(ext) for ext in [
# Package manager and dependencies
'.lock', '.pnpm-lock.yaml', 'package-lock.json', 'Gemfile.lock',
'poetry.lock', 'Cargo.lock', 'composer.lock',
'.pyc', '.pyo', 'requirements.txt', '.gitignore',

# Binaries and compiled files
'.exe', '.dll', '.so', '.dylib', '.bin', '.obj', '.o', '.a',
# Binary and Compiled Files
'.exe', '.dll', '.so', '.dylib', '.bin', '.obj', '.o', '.a',
'.lib', '.jar', '.war', '.ear', '.class', '.pdb', '.ilk', '.exp',
'.apk', '.aab', '.ipa', # Mobile apps
'.wasm', # WebAssembly
'.apk', '.aab', '.ipa', '.wasm',

# Media and compressed files
# Media Files
'.png', '.jpg', '.jpeg', '.gif', '.ico', '.bmp', '.tiff', '.webp',
'.svg', '.eps', '.psd', '.ai', '.sketch',
'.mp3', '.mp4', '.wav', '.flac', '.ogg', '.m4a',
'.avi', '.mov', '.mkv', '.webm', '.wmv', '.flv',
'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
'.ttf', '.otf', '.eot', '.woff', '.woff2',

# Compressed and Binary Data
'.zip', '.rar', '.7z', '.tar', '.gz', '.bz2', '.xz', '.tgz',
'.pkl', '.pickle',
'.npy', '.npz',
'.h5', '.hdf5',

# System and hidden files
'.DS_Store', 'Thumbs.db', '.dockerignore',
'.gitattributes', '.gitmodules',
# Lock Files and Dependencies
'.lock', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
'Gemfile.lock', 'poetry.lock', 'Cargo.lock', 'composer.lock',

# Documentation and resource files
'.min.js', '.min.css', '.map', '.po', '.mo', '.pot', '.drawio',
'.ttf', '.otf', '.eot', '.woff', '.woff2', # Fonts
# Compiled Python
'.pyc', '.pyo',

# Configuration and data files
'.conf', '.config', '.cfg', '.ini',
'.sqlite', '.db', '.mdb', '.sql',
'.pb', '.pbtxt', # Protocol buffers
'.ipynb', # Jupyter notebooks
'.pkl', '.pickle', # Python serialized objects
'.tfrecords', '.tf', # TensorFlow files
'.onnx', # ONNX models
'.h5', '.hdf5', # HDF5 files
'.npy', '.npz', # NumPy files
# System and Hidden Files
'.DS_Store', 'Thumbs.db',

# Build outputs
# Generated Code Files
'.g.dart', '.freezed.dart',
'.pb.go',
'_pb2.py', '_pb2_grpc.py',
'.generated.ts', '.generated.tsx',
'.proto.ts', '.proto.js',
'.min.js', '.min.css',
'.bundle.js', '.bundle.css',
'.chunk.js', '.chunk.css'
'.chunk.js', '.chunk.css',

# IDE Generated
'.pbxproj', '.xcworkspacedata',
'.csproj.user', '.suo',
'.iml', '.ipr', '.iws',

# Map Files
'.map', '.js.map', '.css.map'

]) or any(pattern in file_path for pattern in [
# Special directories
'/node_modules/',
# Cache and Temporary Directories
'/__pycache__/',
'/.git/',
'/.idea/',
'/.vscode/',
'/.vs/',
'/.svn/',
'/.hg/',
'/dist/',
'/build/',
'/target/',
'/out/',
'/bin/',
'/obj/',
'/Debug/',
'/Release/',
'/.next/',
'/.nuxt/',
'/vendor/',
'/venv/',
'/.env',
'/coverage/',
'/logs/',
'/.github/',
'/assets/',
'/public/assets/',
'/static/assets/',
'/.pytest_cache/',
'/.sass-cache/',
'/.parcel-cache/',
'/.cache/',
'/tmp/',
'/temp/'
'/temp/',

# Package Manager Directories
'/node_modules/',
'/bower_components/',

# Test Coverage and Reports
'/coverage/',
'/.nyc_output/',
'/.pytest_cache/',
'/.tox/',

# Environment and Runtime
'/venv/',
'/.env/',
'/.virtualenv/',

# Framework Generated
'/.dart_tool/',
'/.pub-cache/',
'/.angular/',
'/.nuxt/',
'/.next/',
'/.ipynb_checkpoints/',

# CI/CD
'/.github/workflows/',
'/.gitlab/ci/',
'/.circleci/',

# Logs
'/logs/',
'/log/',

# Binary Assets
'/assets/images/',
'/assets/fonts/',
'/assets/media/',
'/public/images/',
'/public/fonts/',
'/static/images/',
'/static/fonts/'
]):
logger.debug(f"Skipping non-processable file: {file_path}")
return None
Expand Down
6 changes: 5 additions & 1 deletion llama_github/llm_integration/initial_load.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,11 @@ def __init__(self,
elif mistral_api_key is not None and mistral_api_key != "" and self.llm is None:
logger.info("Initializing Mistral API...")
self.llm = ChatMistralAI(mistral_api_key=mistral_api_key, model="mistral-large-2411")
self.llm_simple = ChatMistralAI(mistral_api_key=mistral_api_key, model="open-mistral-nemo")
self.llm_simple = ChatMistralAI(
mistral_api_key=mistral_api_key,
model="open-mistral-nemo",
temperature=0.2
)
self.model_type = "OpenAI"
elif openai_api_key is not None and openai_api_key != "" and self.llm is None:
logger.info("Initializing OpenAI API...")
Expand Down
2 changes: 1 addition & 1 deletion llama_github/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.2.3'
__version__ = '0.2.4'
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = llama-github
version = 0.2.3
version = 0.2.4
author = Jet Xu
author_email = [email protected]
description = Llama-github is an open-source Python library that empowers LLM Chatbots, AI Agents, and Auto-dev Agents to conduct Retrieval from actively selected GitHub public projects. It Augments through LLMs and Generates context for any coding question, in order to streamline the development of sophisticated AI-driven applications.
Expand Down

0 comments on commit e242564

Please sign in to comment.