Merged
Commits
37 commits
44603ae
Create sample.csv
pranathiir Mar 27, 2025
9ae58ba
Add files via upload
pranathiir Mar 27, 2025
ddce5ed
Delete Stocks Dataset/sample.csv
pranathiir Mar 27, 2025
3207ad4
Added datasets for predictive analytics2
pranathiir Mar 28, 2025
5149af1
Delete Stocks Dataset directory
pranathiir Mar 28, 2025
11660bb
Add NER_LSTM for Claim Appeal Documents code and model files
Apr 13, 2025
02254c2
Add files via upload
pranathiir Apr 13, 2025
664d0b5
Delete src/PredictiveAnalysis2/app_stockmarketprediction.py
pranathiir Apr 13, 2025
ff1b961
Add files via upload
pranathiir Apr 13, 2025
7594d96
Update stocks_app.py
pranathiir Apr 13, 2025
1d915c3
Remove .DS_Store and add to .gitignore
Apr 13, 2025
0e8291d
WIP: temp commit before rebase
Apr 13, 2025
714c149
WIP: temp commit before rebase
Apr 13, 2025
e757f2f
Final NER_LSTM updates
Apr 13, 2025
068f9bd
Delete Datasets/predictive-analytics2/stocks-datasets/nifty50_5y_full…
pranathiir Apr 13, 2025
4e11982
Added dataset for stock market prediction
pranathiir Apr 13, 2025
f1d1c29
Delete src/PredictiveAnalysis2/stocks_app.py
pranathiir Apr 13, 2025
ab14a95
Added streamlit app for stock market prediction
pranathiir Apr 13, 2025
bef91e2
Folder Restructuring
Apr 13, 2025
9841d20
Delete Datasets/predictive-analytics2/stocks-datasets/merged_nifty50_…
pranathiir Apr 13, 2025
1d92148
Delete Datasets/predictive-analytics2/stocks-datasets/nifty50_5y_1d.csv
pranathiir Apr 13, 2025
9e9cded
Delete Datasets/predictive-analytics2/stocks-datasets/nifty50_realtim…
pranathiir Apr 13, 2025
abc0c01
Restructured folders
Apr 15, 2025
1c76667
Delete src/PredictiveAnalysis2/Frontend/stocks_app.py
pranathiir Apr 15, 2025
425d6df
Restructured Folders
pranathiir Apr 15, 2025
d7073a9
Restructured Folders
pranathiir Apr 15, 2025
b60e52a
Updated requirements.txt
pranathiir Apr 15, 2025
3dc7cab
Updated requirements.txt
pranathiir Apr 15, 2025
2e2d0be
Merge branch 'main' into PredictiveAnalysis2
pranathiir Apr 15, 2025
a157455
Update requirements.txt
pranathiir Apr 15, 2025
4cc4f79
Update requirements.txt
pranathiir Apr 15, 2025
3303ab8
Appeals final update
Apr 15, 2025
fd1835e
added module import path
pranathiir Apr 15, 2025
d534991
Add appeal sample docs
Apr 16, 2025
ffd361f
Delete src/PredictiveAnalysis2/Frontend/app.py
pranathiir Apr 16, 2025
86ae515
Delete src/PredictiveAnalysis2/Frontend/stocks_app.py
pranathiir Apr 16, 2025
4d6464a
created app.py with multiple pages
pranathiir Apr 16, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -172,3 +172,4 @@ cython_debug/

# PyPI configuration file
.pypirc
.DS_Store
@@ -0,0 +1,5 @@
Dear Sir/Madam,
I am writing to appeal the denial of claim CLM654321. The reason provided was "Pre-authorization required". The treatment was provided by Dr. Alan Moore under Cigna.
Please reconsider this decision.
Sincerely,
Jane Doe
Binary file not shown.
Binary file not shown.
61,851 changes: 61,851 additions & 0 deletions Datasets/predictive-analytics2/stocks-datasets/nifty50_5y_full_data.csv

Large diffs are not rendered by default.

107 changes: 43 additions & 64 deletions requirements.txt
@@ -1,64 +1,43 @@
altair==5.5.0
astroid==3.3.9
attrs==25.3.0
blinker==1.9.0
cachetools==5.5.2
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
colorama==0.4.6
dill==0.3.9
et_xmlfile==2.0.0
exceptiongroup==1.2.2
gitdb==4.0.12
GitPython==3.1.44
idna==3.10
iniconfig==2.1.0
isort==6.0.1
Jinja2==3.1.6
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mccabe==0.7.0
mdurl==0.1.2
narwhals==1.32.0
numpy==2.2.4
openpyxl==3.1.5
packaging==24.2
pandas==2.2.3
patsy==1.0.1
pillow==11.1.0
platformdirs==4.3.7
plotly==6.0.1
pluggy==1.5.0
prettytable==3.16.0
protobuf==5.29.4
pyarrow==19.0.1
pydeck==0.9.1
Pygments==2.19.1
pylint==3.3.6
pytest==8.3.5
python-dateutil==2.9.0.post0
pytz==2025.2
referencing==0.36.2
requests==2.32.3
rich==13.9.4
rpds-py==0.24.0
scikit-learn==1.6.1
scipy==1.15.2
six==1.17.0
smmap==5.0.2
statsmodels==0.14.4
streamlit==1.44.0
tenacity==9.0.0
threadpoolctl==3.6.0
toml==0.10.2
tomli==2.2.1
tomlkit==0.13.2
tornado==6.4.2
typing_extensions==4.13.0
tzdata==2025.2
urllib3==2.3.0
watchdog==6.0.0
absl-py==2.0.0
astunparse==1.6.3
cachetools==5.3.0
certifi==2022.12.7
chardet==4.0.0
charset-normalizer==3.1.0
Faker==25.9.1
flatbuffers==25.2.10
gast==0.4.0
google-pasta==0.2.0
grpcio==1.71.0
h5py==3.13.0
idna==2.10
ipykernel==6.25.1
ipython==8.14.0
Jinja2==3.1.2
keras==3.9.2
MarkupSafe==2.1.3
ml-dtypes==0.4.1
numpy==1.26.4
opt-einsum==3.4.0
pandas==2.0.2
plotly==5.16.1
protobuf==3.20.3
pylint==3.3.6
pytest==8.3.5
PyPDF2==3.0.1
python-docx==1.1.2
requests==2.31.0
scikit-learn==1.3.0
six==1.16.0
statsmodels==0.14.4
streamlit==1.41.0
tensorboard==2.18.0
tensorboard-data-server==0.7.2
tensorboard-plugin-wit==1.8.1
tensorflow==2.18.0
tensorflow-io-gcs-filesystem==0.31.0
typing_extensions==4.12.2
urllib3==1.26.15
Werkzeug==3.0.5
wrapt==1.15.0
yfinance==0.2.55
39 changes: 39 additions & 0 deletions src/PredictiveAnalysis2/Backend/data_utils.py
@@ -0,0 +1,39 @@
"""
data_utils.py file for Streamlit application
"""

import pandas as pd
import yfinance as yf
import streamlit as st

def load_full_data(file):
    """Load and preprocess historical stock data from CSV"""
    data_frame = pd.read_csv(file, parse_dates=['Date'], dayfirst=True)
    data_frame.columns = data_frame.columns.str.strip()
    data_frame['Date'] = pd.to_datetime(data_frame['Date'], format='%d-%m-%Y', errors='coerce')
    data_frame = data_frame.dropna(subset=['Date']).sort_values('Date')
    data_frame.set_index('Date', inplace=True)

    numeric_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    data_frame[numeric_cols] = data_frame[numeric_cols].apply(pd.to_numeric, errors='coerce')
    data_frame = data_frame.dropna(subset=numeric_cols)

    return data_frame[['Stock Name'] + numeric_cols]

@st.cache_data(ttl=300)
def get_realtime_data():
    """Fetch recent weekly NIFTY50 data"""
    ticker_data = yf.Ticker("^NSEI")
    return ticker_data.history(period="5y", interval="1wk")


@st.cache_data(ttl=300)
def get_realtime_daily_data():
    """Fetch recent daily NIFTY50 data"""
    ticker_data = yf.Ticker("^NSEI")
    return ticker_data.history(period="2y", interval="1d")

@st.cache_data
def load_and_cache_file(uploaded_file):
    """Cache uploaded CSV file content"""
    return load_full_data(uploaded_file)
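
Reviewer note: a minimal sketch of how a Streamlit page might call these loaders. It is not part of the diff; the page title and widget labels are assumptions, while the function names and the Date / 'Stock Name' / OHLCV columns come from data_utils.py above.

# Hypothetical usage sketch (not in this PR): wiring the loaders into a Streamlit page.
import streamlit as st

from data_utils import get_realtime_data, load_and_cache_file

st.title("NIFTY50 Explorer")  # illustrative page title

uploaded = st.file_uploader("Upload historical CSV", type="csv")
if uploaded is not None:
    history = load_and_cache_file(uploaded)   # cached parse; expects Date + 'Stock Name' + OHLCV
    st.line_chart(history["Close"])

realtime = get_realtime_data()                # weekly ^NSEI data, cached for 300 s
st.dataframe(realtime.tail())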
Binary file added src/PredictiveAnalysis2/Backend/idx_to_label.pkl
Binary file not shown.
1 change: 1 addition & 0 deletions src/PredictiveAnalysis2/Backend/max_len.pkl
@@ -0,0 +1 @@
�K(.
65 changes: 65 additions & 0 deletions src/PredictiveAnalysis2/Backend/model_utils.py
@@ -0,0 +1,65 @@
"""
model_utils.py file for Streamlit application
"""

import numpy as np
from tensorflow import keras

def create_sequences(data, sequence_length, feature_index):
    """Create LSTM sequences from time series data"""
    x_seq, y_seq = [], []
    for i in range(len(data) - sequence_length):
        window = data[i:i + sequence_length]
        target = data[i + sequence_length, feature_index]
        if not np.any(np.isnan(window)) and not np.isnan(target):
            x_seq.append(window)
            y_seq.append(target)
    return np.array(x_seq), np.array(y_seq)


def build_lstm_model(input_shape):
    """Build a simple stacked LSTM model"""
    keras.backend.clear_session()
    model = keras.Sequential([
        keras.layers.LSTM(50, return_sequences=True, input_shape=input_shape),
        keras.layers.Dropout(0.2),
        keras.layers.LSTM(50, return_sequences=False),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(25),
        keras.layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')
    return model


def build_bidirectional_lstm(sequence_length):
    """Build improved bidirectional LSTM model"""
    model = keras.Sequential([
        keras.layers.Bidirectional(
            keras.layers.LSTM(128, return_sequences=True),
            input_shape=(sequence_length, 1)),
        keras.layers.Dropout(0.3),
        keras.layers.Bidirectional(keras.layers.LSTM(64)),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')
    return model


def rolling_forecast(model, normalized_data, sequence_length):
    """Perform rolling forecast using trained model"""
    predictions = []
    if len(normalized_data) < sequence_length:
        return predictions

    last_sequence = normalized_data[:sequence_length].reshape(1, sequence_length, -1)

    for i in range(sequence_length, len(normalized_data)):
        prediction = model.predict(last_sequence, verbose=0)[0][0]
        predictions.append(prediction)
        next_input = normalized_data[i].reshape(1, 1, -1)
        last_sequence = np.append(last_sequence[:, 1:, :], next_input, axis=1)

    return predictions
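
Reviewer note: a short end-to-end sketch of how these helpers fit together on a single price series. The synthetic series, the MinMax scaling, and sequence_length=60 are illustrative assumptions rather than values fixed by this PR.

# Usage sketch (not in this PR): window a series, train the stacked LSTM, then roll forward.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

from model_utils import build_lstm_model, create_sequences, rolling_forecast

sequence_length = 60                                   # assumed window length

# Stand-in for a real Close-price column (e.g. from data_utils.load_full_data()).
rng = np.random.default_rng(0)
close = (np.cumsum(rng.normal(0, 1, 500)) + 100.0).reshape(-1, 1)

scaler = MinMaxScaler()
normalized = scaler.fit_transform(close)

# Supervised windows: predict the next value from the previous 60.
x_train, y_train = create_sequences(normalized, sequence_length, feature_index=0)

model = build_lstm_model(input_shape=(sequence_length, 1))
model.fit(x_train, y_train, epochs=2, batch_size=32, verbose=0)

# Walk forward through the series, feeding each observed value back in.
predictions = rolling_forecast(model, normalized, sequence_length)
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))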
Binary file added src/PredictiveAnalysis2/Backend/ner_model.h5
Binary file not shown.
Binary file added src/PredictiveAnalysis2/Backend/tokenizer.pkl
Binary file not shown.
129 changes: 129 additions & 0 deletions src/PredictiveAnalysis2/Backend/train_model.py
@@ -0,0 +1,129 @@
""" Program that does the model training """

import pickle
import random
import re

import numpy as np
from faker import Faker
from keras.layers import Bidirectional, Dense, Embedding, Input, LSTM, TimeDistributed
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

# Initialize Faker
fake = Faker()

# Generate synthetic appeal letters
def generate_appeal_letters(n_samples=500):
    """Generate synthetic appeal letters with entities."""
    claim_numbers = [f"CLM{random.randint(100000, 999999)}" for _ in range(50)]
    denial_reasons = [
        "Service not covered", "Pre-authorization required", "Out of network provider",
        "Insufficient documentation", "Medical necessity not established"
    ]
    health_plans = ["BlueCross BlueShield", "United Healthcare", "Aetna", "Cigna", "Medicare"]

    letters_list, annotations_list = [], []

    for _ in range(n_samples):
        claim_num = random.choice(claim_numbers)
        reason = random.choice(denial_reasons)
        doctor = f"Dr. {fake.name()}"
        plan = random.choice(health_plans)

        appeal_letter = (
            f"Dear Sir/Madam, I am writing to appeal the denial of claim {claim_num}. "
            f'The reason provided was "{reason}". '
            f'The treatment was provided by {doctor} under {plan}. '
            f"Please reconsider this decision. Sincerely, {fake.name()}"
        )

        entities = [(claim_num, "CLAIM"), (reason, "REASON"), (doctor, "DOCTOR"), (plan, "PLAN")]
        letters_list.append(appeal_letter)
        annotations_list.append(entities)

    return letters_list, annotations_list

# Generate dataset
letters_data, annotations_data = generate_appeal_letters(500)

# Tokenization
tokenizer = Tokenizer(oov_token="OOV")
tokenizer.fit_on_texts(letters_data)
word_index = tokenizer.word_index
total_vocab_size = len(word_index) + 1

# Label mappings
label_to_idx = {
    "O": 0, "B-CLAIM": 1, "I-CLAIM": 2,
    "B-REASON": 3, "I-REASON": 4,
    "B-DOCTOR": 5, "I-DOCTOR": 6,
    "B-PLAN": 7, "I-PLAN": 8
}
idx_to_label = {i: l for l, i in label_to_idx.items()}

# Preprocessing
X_data, y_data = [], []
max_sequence_len = max(len(text.split()) for text in letters_data)

for text, annotation in zip(letters_data, annotations_data):
    words = text.split()
    x = [word_index.get(w, 1) for w in words]  # 1 = OOV
    labels = ["O"] * len(words)

    for entity, tag in annotation:
        entity_words = entity.split()
        entity_words_cleaned = [re.sub(r"\W+", "", w).lower() for w in entity_words]

        for i in range(len(words) - len(entity_words) + 1):
            window = words[i:i + len(entity_words)]
            window_cleaned = [re.sub(r"\W+", "", w).lower() for w in window]

            if window_cleaned == entity_words_cleaned:
                labels[i] = f"B-{tag}"
                for j in range(1, len(entity_words)):
                    labels[i + j] = f"I-{tag}"

    y_seq = [label_to_idx[l] for l in labels]
    X_data.append(x)
    y_data.append(y_seq)

X_data = pad_sequences(X_data, maxlen=max_sequence_len, padding='post')
y_data = pad_sequences(y_data, maxlen=max_sequence_len, padding='post')
y_data = np.array([to_categorical(i, num_classes=len(label_to_idx)) for i in y_data])

# Build model
def build_bilstm_model(vocab, num_classes, max_len):
    """Build and return a BiLSTM model for NER."""
    inputs = Input(shape=(max_len,))
    embedding = Embedding(input_dim=vocab, output_dim=100)(inputs)
    lstm = Bidirectional(LSTM(
        128,
        return_sequences=True,
        dropout=0.5,
        recurrent_dropout=0.5))(embedding)
    dense = TimeDistributed(Dense(num_classes, activation='softmax'))(lstm)
    model = Model(inputs, dense)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Train and save
model_instance = build_bilstm_model(total_vocab_size, len(label_to_idx), max_sequence_len)
model_instance.fit(
    X_data[:400], y_data[:400],
    validation_data=(X_data[400:], y_data[400:]),
    epochs=10, batch_size=32
)

model_instance.save("ner_model.h5")

with open("tokenizer.pkl", "wb") as f:
pickle.dump(tokenizer, f)

with open("idx_to_label.pkl", "wb") as f:
pickle.dump(idx_to_label, f)

with open("max_len.pkl", "wb") as f:
pickle.dump(max_sequence_len, f)
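
Reviewer note: a hedged sketch of how the four saved artifacts (ner_model.h5, tokenizer.pkl, idx_to_label.pkl, max_len.pkl) might be loaded back for inference on a new appeal letter. The frontend's actual inference code may differ; the lookup below simply mirrors the training-time preprocessing above, and the sample text is the appeal letter added earlier in this diff.

# Inference sketch (not in this PR): tag a new appeal letter with the saved model.
import pickle

import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

model = load_model("ner_model.h5")
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)
with open("idx_to_label.pkl", "rb") as f:
    idx_to_label = pickle.load(f)
with open("max_len.pkl", "rb") as f:
    max_len = pickle.load(f)

text = ('Dear Sir/Madam, I am writing to appeal the denial of claim CLM654321. '
        'The reason provided was "Pre-authorization required". '
        'The treatment was provided by Dr. Alan Moore under Cigna.')

words = text.split()
seq = [tokenizer.word_index.get(w, 1) for w in words]      # 1 = OOV, as in training
padded = pad_sequences([seq], maxlen=max_len, padding='post')

probs = model.predict(padded, verbose=0)[0]                # (max_len, num_classes)
tags = [idx_to_label[int(np.argmax(p))] for p in probs[:len(words)]]

for word, tag in zip(words, tags):
    if tag != "O":
        print(word, tag)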