Merged
Commits
37 commits
44603ae
Create sample.csv
pranathiir Mar 27, 2025
9ae58ba
Add files via upload
pranathiir Mar 27, 2025
ddce5ed
Delete Stocks Dataset/sample.csv
pranathiir Mar 27, 2025
3207ad4
Added datasets for predictive analytics2
pranathiir Mar 28, 2025
5149af1
Delete Stocks Dataset directory
pranathiir Mar 28, 2025
11660bb
Add NER_LSTM for Claim Appeal Documents code and model files
Apr 13, 2025
02254c2
Add files via upload
pranathiir Apr 13, 2025
664d0b5
Delete src/PredictiveAnalysis2/app_stockmarketprediction.py
pranathiir Apr 13, 2025
ff1b961
Add files via upload
pranathiir Apr 13, 2025
7594d96
Update stocks_app.py
pranathiir Apr 13, 2025
1d915c3
Remove .DS_Store and add to .gitignore
Apr 13, 2025
0e8291d
WIP: temp commit before rebase
Apr 13, 2025
714c149
WIP: temp commit before rebase
Apr 13, 2025
e757f2f
Final NER_LSTM updates
Apr 13, 2025
068f9bd
Delete Datasets/predictive-analytics2/stocks-datasets/nifty50_5y_full…
pranathiir Apr 13, 2025
4e11982
Added dataset for stock market prediction
pranathiir Apr 13, 2025
f1d1c29
Delete src/PredictiveAnalysis2/stocks_app.py
pranathiir Apr 13, 2025
ab14a95
Added streamlit app for stock market prediction
pranathiir Apr 13, 2025
bef91e2
Folder Restructuring
Apr 13, 2025
9841d20
Delete Datasets/predictive-analytics2/stocks-datasets/merged_nifty50_…
pranathiir Apr 13, 2025
1d92148
Delete Datasets/predictive-analytics2/stocks-datasets/nifty50_5y_1d.csv
pranathiir Apr 13, 2025
9e9cded
Delete Datasets/predictive-analytics2/stocks-datasets/nifty50_realtim…
pranathiir Apr 13, 2025
abc0c01
Restructured folders
Apr 15, 2025
1c76667
Delete src/PredictiveAnalysis2/Frontend/stocks_app.py
pranathiir Apr 15, 2025
425d6df
Restructured Folders
pranathiir Apr 15, 2025
d7073a9
Restructured Folders
pranathiir Apr 15, 2025
b60e52a
Updated requirements.txt
pranathiir Apr 15, 2025
3dc7cab
Updated requirements.txt
pranathiir Apr 15, 2025
2e2d0be
Merge branch 'main' into PredictiveAnalysis2
pranathiir Apr 15, 2025
a157455
Update requirements.txt
pranathiir Apr 15, 2025
4cc4f79
Update requirements.txt
pranathiir Apr 15, 2025
3303ab8
Appeals final update
Apr 15, 2025
fd1835e
added module import path
pranathiir Apr 15, 2025
d534991
Add appeal sample docs
Apr 16, 2025
ffd361f
Delete src/PredictiveAnalysis2/Frontend/app.py
pranathiir Apr 16, 2025
86ae515
Delete src/PredictiveAnalysis2/Frontend/stocks_app.py
pranathiir Apr 16, 2025
4d6464a
created app.py with multiple pages
pranathiir Apr 16, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -172,3 +172,4 @@ cython_debug/

# PyPI configuration file
.pypirc
.DS_Store
@@ -0,0 +1,5 @@
Dear Sir/Madam,
I am writing to appeal the denial of claim CLM654321. The reason provided was "Pre-authorization required". The treatment was provided by Dr. Alan Moore under Cigna.
Please reconsider this decision.
Sincerely,
Jane Doe
Binary file not shown.
Binary file not shown.
61,851 changes: 61,851 additions & 0 deletions Datasets/predictive-analytics2/stocks-datasets/nifty50_5y_full_data.csv

Large diffs are not rendered by default.

107 changes: 43 additions & 64 deletions requirements.txt
@@ -1,64 +1,43 @@
altair==5.5.0
astroid==3.3.9
attrs==25.3.0
blinker==1.9.0
cachetools==5.5.2
certifi==2025.1.31
charset-normalizer==3.4.1
click==8.1.8
colorama==0.4.6
dill==0.3.9
et_xmlfile==2.0.0
exceptiongroup==1.2.2
gitdb==4.0.12
GitPython==3.1.44
idna==3.10
iniconfig==2.1.0
isort==6.0.1
Jinja2==3.1.6
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2024.10.1
markdown-it-py==3.0.0
MarkupSafe==3.0.2
mccabe==0.7.0
mdurl==0.1.2
narwhals==1.32.0
numpy==2.2.4
openpyxl==3.1.5
packaging==24.2
pandas==2.2.3
patsy==1.0.1
pillow==11.1.0
platformdirs==4.3.7
plotly==6.0.1
pluggy==1.5.0
prettytable==3.16.0
protobuf==5.29.4
pyarrow==19.0.1
pydeck==0.9.1
Pygments==2.19.1
pylint==3.3.6
pytest==8.3.5
python-dateutil==2.9.0.post0
pytz==2025.2
referencing==0.36.2
requests==2.32.3
rich==13.9.4
rpds-py==0.24.0
scikit-learn==1.6.1
scipy==1.15.2
six==1.17.0
smmap==5.0.2
statsmodels==0.14.4
streamlit==1.44.0
tenacity==9.0.0
threadpoolctl==3.6.0
toml==0.10.2
tomli==2.2.1
tomlkit==0.13.2
tornado==6.4.2
typing_extensions==4.13.0
tzdata==2025.2
urllib3==2.3.0
watchdog==6.0.0
absl-py==2.0.0
astunparse==1.6.3
cachetools==5.3.0
certifi==2022.12.7
chardet==4.0.0
charset-normalizer==3.1.0
Faker==25.9.1
flatbuffers==25.2.10
gast==0.4.0
google-pasta==0.2.0
grpcio==1.71.0
h5py==3.13.0
idna==2.10
ipykernel==6.25.1
ipython==8.14.0
Jinja2==3.1.2
keras==3.9.2
MarkupSafe==2.1.3
ml-dtypes==0.4.1
numpy==1.26.4
opt-einsum==3.4.0
pandas==2.0.2
plotly==5.16.1
protobuf==3.20.3
pylint==3.3.6
pytest==8.3.5
PyPDF2==3.0.1
python-docx==1.1.2
requests==2.31.0
scikit-learn==1.3.0
six==1.16.0
statsmodels==0.14.4
streamlit==1.41.0
tensorboard==2.18.0
tensorboard-data-server==0.7.2
tensorboard-plugin-wit==1.8.1
tensorflow==2.18.0
tensorflow-io-gcs-filesystem==0.31.0
typing_extensions==4.12.2
urllib3==1.26.15
Werkzeug==3.0.5
wrapt==1.15.0
yfinance==0.2.55
39 changes: 39 additions & 0 deletions src/PredictiveAnalysis2/Backend/data_utils.py
@@ -0,0 +1,39 @@
"""
data_utils.py file for Streamlit application
"""

import pandas as pd
import yfinance as yf
import streamlit as st

def load_full_data(file):
    """Load and preprocess historical stock data from CSV"""
    data_frame = pd.read_csv(file, parse_dates=['Date'], dayfirst=True)
    data_frame.columns = data_frame.columns.str.strip()
    data_frame['Date'] = pd.to_datetime(data_frame['Date'], format='%d-%m-%Y', errors='coerce')
    data_frame = data_frame.dropna(subset=['Date']).sort_values('Date')
    data_frame.set_index('Date', inplace=True)

    numeric_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    data_frame[numeric_cols] = data_frame[numeric_cols].apply(pd.to_numeric, errors='coerce')
    data_frame = data_frame.dropna(subset=numeric_cols)

    return data_frame[['Stock Name'] + numeric_cols]

@st.cache_data(ttl=300)
def get_realtime_data():
    """Fetch recent weekly NIFTY50 data"""
    ticker_data = yf.Ticker("^NSEI")
    return ticker_data.history(period="5y", interval="1wk")


@st.cache_data(ttl=300)
def get_realtime_daily_data():
    """Fetch recent daily NIFTY50 data"""
    ticker_data = yf.Ticker("^NSEI")
    return ticker_data.history(period="2y", interval="1d")

@st.cache_data
def load_and_cache_file(uploaded_file):
    """Cache uploaded CSV file content"""
    return load_full_data(uploaded_file)
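
Reviewer note: a minimal sketch of how a Streamlit page might call these loaders. It is not part of the diff; the page title and widget labels are assumptions, while the function names and the Date / 'Stock Name' / OHLCV columns come from data_utils.py above.

# Hypothetical usage sketch (not in this PR): wiring the loaders into a Streamlit page.
import streamlit as st

from data_utils import get_realtime_data, load_and_cache_file

st.title("NIFTY50 Explorer")  # illustrative page title

uploaded = st.file_uploader("Upload historical CSV", type="csv")
if uploaded is not None:
    history = load_and_cache_file(uploaded)   # cached parse; expects Date + 'Stock Name' + OHLCV
    st.line_chart(history["Close"])

realtime = get_realtime_data()                # weekly ^NSEI data, cached for 300 s
st.dataframe(realtime.tail())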
Binary file added src/PredictiveAnalysis2/Backend/idx_to_label.pkl
Binary file not shown.
1 change: 1 addition & 0 deletions src/PredictiveAnalysis2/Backend/max_len.pkl
@@ -0,0 +1 @@
�K(.
65 changes: 65 additions & 0 deletions src/PredictiveAnalysis2/Backend/model_utils.py
@@ -0,0 +1,65 @@
"""
model_utils.py file for Streamlit application
"""

import numpy as np
from tensorflow import keras

def create_sequences(data, sequence_length, feature_index):
    """Create LSTM sequences from time series data"""
    x_seq, y_seq = [], []
    for i in range(len(data) - sequence_length):
        window = data[i:i + sequence_length]
        target = data[i + sequence_length, feature_index]
        if not np.any(np.isnan(window)) and not np.isnan(target):
            x_seq.append(window)
            y_seq.append(target)
    return np.array(x_seq), np.array(y_seq)


def build_lstm_model(input_shape):
    """Build a simple stacked LSTM model"""
    keras.backend.clear_session()
    model = keras.Sequential([
        keras.layers.LSTM(50, return_sequences=True, input_shape=input_shape),
        keras.layers.Dropout(0.2),
        keras.layers.LSTM(50, return_sequences=False),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(25),
        keras.layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')
    return model


def build_bidirectional_lstm(sequence_length):
    """Build improved bidirectional LSTM model"""
    model = keras.Sequential([
        keras.layers.Bidirectional(
            keras.layers.LSTM(128, return_sequences=True),
            input_shape=(sequence_length, 1)),
        keras.layers.Dropout(0.3),
        keras.layers.Bidirectional(keras.layers.LSTM(64)),
        keras.layers.Dropout(0.3),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(1)
    ])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mean_squared_error')
    return model


def rolling_forecast(model, normalized_data, sequence_length):
    """Perform rolling forecast using trained model"""
    predictions = []
    if len(normalized_data) < sequence_length:
        return predictions

    last_sequence = normalized_data[:sequence_length].reshape(1, sequence_length, -1)

    for i in range(sequence_length, len(normalized_data)):
        prediction = model.predict(last_sequence, verbose=0)[0][0]
        predictions.append(prediction)
        next_input = normalized_data[i].reshape(1, 1, -1)
        last_sequence = np.append(last_sequence[:, 1:, :], next_input, axis=1)

    return predictions
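
Reviewer note: a short end-to-end sketch of how these helpers fit together on a single price series. The synthetic series, the MinMax scaling, and sequence_length=60 are illustrative assumptions rather than values fixed by this PR.

# Usage sketch (not in this PR): window a series, train the stacked LSTM, then roll forward.
import numpy as np
from sklearn.preprocessing import MinMaxScaler

from model_utils import build_lstm_model, create_sequences, rolling_forecast

sequence_length = 60                                   # assumed window length

# Stand-in for a real Close-price column (e.g. from data_utils.load_full_data()).
rng = np.random.default_rng(0)
close = (np.cumsum(rng.normal(0, 1, 500)) + 100.0).reshape(-1, 1)

scaler = MinMaxScaler()
normalized = scaler.fit_transform(close)

# Supervised windows: predict the next value from the previous 60.
x_train, y_train = create_sequences(normalized, sequence_length, feature_index=0)

model = build_lstm_model(input_shape=(sequence_length, 1))
model.fit(x_train, y_train, epochs=2, batch_size=32, verbose=0)

# Walk forward through the series, feeding each observed value back in.
predictions = rolling_forecast(model, normalized, sequence_length)
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, 1))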
Binary file added src/PredictiveAnalysis2/Backend/ner_model.h5
Binary file not shown.
Binary file added src/PredictiveAnalysis2/Backend/tokenizer.pkl
Binary file not shown.
129 changes: 129 additions & 0 deletions src/PredictiveAnalysis2/Backend/train_model.py
@@ -0,0 +1,129 @@
""" Program that does the model training """

import pickle
import random
import re

import numpy as np
from faker import Faker
from keras.layers import Bidirectional, Dense, Embedding, Input, LSTM, TimeDistributed
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

# Initialize Faker
fake = Faker()

# Generate synthetic appeal letters
def generate_appeal_letters(n_samples=500):
    """Generate synthetic appeal letters with entities."""
    claim_numbers = [f"CLM{random.randint(100000, 999999)}" for _ in range(50)]
    denial_reasons = [
        "Service not covered", "Pre-authorization required", "Out of network provider",
        "Insufficient documentation", "Medical necessity not established"
    ]
    health_plans = ["BlueCross BlueShield", "United Healthcare", "Aetna", "Cigna", "Medicare"]

    letters_list, annotations_list = [], []

    for _ in range(n_samples):
        claim_num = random.choice(claim_numbers)
        reason = random.choice(denial_reasons)
        doctor = f"Dr. {fake.name()}"
        plan = random.choice(health_plans)

        appeal_letter = (
            f"Dear Sir/Madam, I am writing to appeal the denial of claim {claim_num}. "
            f'The reason provided was "{reason}". '
            f'The treatment was provided by {doctor} under {plan}. '
            f"Please reconsider this decision. Sincerely, {fake.name()}"
        )

        entities = [(claim_num, "CLAIM"), (reason, "REASON"), (doctor, "DOCTOR"), (plan, "PLAN")]
        letters_list.append(appeal_letter)
        annotations_list.append(entities)

    return letters_list, annotations_list

# Generate dataset
letters_data, annotations_data = generate_appeal_letters(500)

# Tokenization
tokenizer = Tokenizer(oov_token="OOV")
tokenizer.fit_on_texts(letters_data)
word_index = tokenizer.word_index
total_vocab_size = len(word_index) + 1

# Label mappings
label_to_idx = {
    "O": 0, "B-CLAIM": 1, "I-CLAIM": 2,
    "B-REASON": 3, "I-REASON": 4,
    "B-DOCTOR": 5, "I-DOCTOR": 6,
    "B-PLAN": 7, "I-PLAN": 8
}
idx_to_label = {i: l for l, i in label_to_idx.items()}

# Preprocessing
X_data, y_data = [], []
max_sequence_len = max(len(text.split()) for text in letters_data)

for text, annotation in zip(letters_data, annotations_data):
    words = text.split()
    x = [word_index.get(w, 1) for w in words]  # 1 = OOV
    labels = ["O"] * len(words)

    for entity, tag in annotation:
        entity_words = entity.split()
        entity_words_cleaned = [re.sub(r"\W+", "", w).lower() for w in entity_words]

        for i in range(len(words) - len(entity_words) + 1):
            window = words[i:i + len(entity_words)]
            window_cleaned = [re.sub(r"\W+", "", w).lower() for w in window]

            if window_cleaned == entity_words_cleaned:
                labels[i] = f"B-{tag}"
                for j in range(1, len(entity_words)):
                    labels[i + j] = f"I-{tag}"

    y_seq = [label_to_idx[l] for l in labels]
    X_data.append(x)
    y_data.append(y_seq)

X_data = pad_sequences(X_data, maxlen=max_sequence_len, padding='post')
y_data = pad_sequences(y_data, maxlen=max_sequence_len, padding='post')
y_data = np.array([to_categorical(i, num_classes=len(label_to_idx)) for i in y_data])

# Build model
def build_bilstm_model(vocab, num_classes, max_len):
    """Build and return a BiLSTM model for NER."""
    inputs = Input(shape=(max_len,))
    embedding = Embedding(input_dim=vocab, output_dim=100)(inputs)
    lstm = Bidirectional(LSTM(
        128,
        return_sequences=True,
        dropout=0.5,
        recurrent_dropout=0.5))(embedding)
    dense = TimeDistributed(Dense(num_classes, activation='softmax'))(lstm)
    model = Model(inputs, dense)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Train and save
model_instance = build_bilstm_model(total_vocab_size, len(label_to_idx), max_sequence_len)
model_instance.fit(
    X_data[:400], y_data[:400],
    validation_data=(X_data[400:], y_data[400:]),
    epochs=10, batch_size=32
)

model_instance.save("ner_model.h5")

with open("tokenizer.pkl", "wb") as f:
pickle.dump(tokenizer, f)

with open("idx_to_label.pkl", "wb") as f:
pickle.dump(idx_to_label, f)

with open("max_len.pkl", "wb") as f:
pickle.dump(max_sequence_len, f)
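
Reviewer note: a hedged sketch of how the four saved artifacts (ner_model.h5, tokenizer.pkl, idx_to_label.pkl, max_len.pkl) might be loaded back for inference on a new appeal letter. The frontend's actual inference code may differ; the lookup below simply mirrors the training-time preprocessing above, and the sample text is the appeal letter added earlier in this diff.

# Inference sketch (not in this PR): tag a new appeal letter with the saved model.
import pickle

import numpy as np
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

model = load_model("ner_model.h5")
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)
with open("idx_to_label.pkl", "rb") as f:
    idx_to_label = pickle.load(f)
with open("max_len.pkl", "rb") as f:
    max_len = pickle.load(f)

text = ('Dear Sir/Madam, I am writing to appeal the denial of claim CLM654321. '
        'The reason provided was "Pre-authorization required". '
        'The treatment was provided by Dr. Alan Moore under Cigna.')

words = text.split()
seq = [tokenizer.word_index.get(w, 1) for w in words]      # 1 = OOV, as in training
padded = pad_sequences([seq], maxlen=max_len, padding='post')

probs = model.predict(padded, verbose=0)[0]                # (max_len, num_classes)
tags = [idx_to_label[int(np.argmax(p))] for p in probs[:len(words)]]

for word, tag in zip(words, tags):
    if tag != "O":
        print(word, tag)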