
Commit b646b80

Add files via upload
1 parent 2233079 commit b646b80

File tree: 5 files changed, +1383 −0 lines

CombineDFwithThreads.py

Lines changed: 82 additions & 0 deletions
import pandas as pd
import numpy as np
import threading


def read_dataframe(file_path):
    # Read the data frame from the file
    df = pd.read_csv(file_path)  # Modify this according to your file format
    return df


def combine_dataframes(df_list):
    # Combine the data frames into a single data frame
    combined_df = pd.concat(df_list, ignore_index=True)  # Modify this according to your combination logic
    return combined_df


def func1(df):
    # Perform operations on the combined data frame
    # ...
    pass


def func2(df):
    # Perform operations on the combined data frame
    # ...
    pass


def return_df(df):
    return df

# Create sample data frames of random integers
df1 = pd.DataFrame(np.random.randint(0, 100, size=(1000, 108)))
df2 = pd.DataFrame(np.random.randint(0, 100, size=(1000, 108)))
df3 = pd.DataFrame(np.random.randint(0, 100, size=(1000, 108)))
df4 = pd.DataFrame(np.random.randint(0, 100, size=(1000, 108)))

# List to store the individual data frames
df_list = []

# List to store combined data frames
combined_df = []

# Data frames to be read by the worker threads
data_frames = [df1, df3, df4, df4]

# Create threads for reading the data frames. The frame is passed through
# args so each thread receives its own value; a lambda that merely closed
# over the loop variable would see whichever frame the loop held last.
threads = []
for data_frame in data_frames:
    thread = threading.Thread(target=lambda frame: df_list.append(return_df(frame)), args=(data_frame,))
    threads.append(thread)
    thread.start()

# Wait for all threads to finish
for thread in threads:
    thread.join()
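
# Note: appending to a Python list is atomic under CPython's GIL, so no
# explicit lock is needed around df_list.append. Threads mainly pay off
# when read_dataframe is I/O-bound; CPU-bound pandas work will not run
# in parallel under the GIL.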

# Create a thread for combining the data frames
combine_thread = threading.Thread(target=lambda: combined_df.append(combine_dataframes(df_list)))
combine_thread.start()
combine_thread.join()

print(df_list)
print(combined_df)

# Create threads for calling different functions on the combined data frame
# (combined_df is a one-element list, so pass the frame itself)
func1_thread = threading.Thread(target=lambda: func1(combined_df[0]))
func2_thread = threading.Thread(target=lambda: func2(combined_df[0]))

# Start the function threads
func1_thread.start()
func2_thread.start()

# Wait for the function threads to finish
func1_thread.join()
func2_thread.join()

# The functions func1 and func2 have been called on the combined data frame
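
A minimal sketch of the file-based path that read_dataframe implies; the demo above feeds in-memory frames, and the CSV paths here are hypothetical placeholders:

# Hypothetical CSV paths -- substitute real files
paths = ["logs_a.csv", "logs_b.csv", "logs_c.csv"]

file_dfs = []
file_threads = []
for path in paths:
    # Each thread reads one file and appends the resulting frame
    t = threading.Thread(target=lambda p: file_dfs.append(read_dataframe(p)), args=(path,))
    file_threads.append(t)
    t.start()
for t in file_threads:
    t.join()

combined_from_files = combine_dataframes(file_dfs)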

EDA.py

Lines changed: 299 additions & 0 deletions
import pandas as pd
import time
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import random
import glob
import ReadFiles as rf
import os

warnings.filterwarnings("ignore")


def draw_plots(df):

    # Plot a histogram of the '$startdate' feature using Matplotlib
    plt.figure(figsize=(10, 6))
    plt.hist(df['$startdate'], bins=100, color='blue', alpha=0.7)
    plt.title('Histogram of $startdate')
    plt.xlabel('Feature Value')
    plt.ylabel('Frequency')
    plt.show()

    # Plot a histogram of the '$retdate' feature using Matplotlib
    plt.figure(figsize=(10, 6))
    plt.hist(df['$retdate'], bins=100, color='blue', alpha=0.7)
    plt.title('Histogram of $retdate')
    plt.xlabel('Feature Value')
    plt.ylabel('Frequency')
    plt.show()

    # Plot a scatter plot between two features using Seaborn
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=df['$startdate'], y=df['$disk_files'], data=df, alpha=0.5)
    plt.title('Scatter Plot between $startdate and $disk_files')
    plt.xlabel('$startdate')
    plt.ylabel('$disk_files')
    plt.show()

def feature_count(df):
    # Get the number of features
    n_features = len(df.columns)

    # Iterate over the features two at a time
    for i in range(0, n_features, 2):
        # Get the features in the current iteration
        features_i = df.columns[i:i + 2]

        # Create a plot
        fig, ax = plt.subplots()

        # Plot the frequency of each value combination of the current features
        df[features_i].value_counts().plot(kind='bar', ax=ax)
        ax.set_title('Frequency of Features {}'.format(list(features_i)))

        # Show the plot
        plt.tight_layout()
        plt.show()

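# Note: DataFrame.value_counts (pandas >= 1.1) counts unique combinations of
# the selected columns, so each bar above represents a pair of values rather
# than a single feature's frequency.
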
def queries_by_date(df):

    # Group the data by '$date' and count the number of queries for each day
    daily_query_counts = df['$date'].value_counts().sort_index()

    # Create a plot
    plt.figure(figsize=(10, 6))
    plt.plot(daily_query_counts.index, daily_query_counts.values, marker='o', linestyle='-', color='b')
    plt.title('Number of Queries Generated Each Day')
    plt.xlabel('Date')
    plt.ylabel('Number of Queries')
    plt.xticks(rotation=45)
    plt.grid(True)

    # Show the plot
    plt.tight_layout()
    plt.show()

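# Note: if '$date' is stored as a string, sort_index() sorts lexically;
# converting first with pd.to_datetime(df['$date']) would give a
# chronological x-axis.
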
def queries_by_verb(df):

    # Count the number of queries for each verb (the action taken on the query)
    verb_query_counts = df['$verb'].value_counts().sort_index()

    # Create a plot
    plt.figure(figsize=(10, 6))
    plt.plot(verb_query_counts.index, verb_query_counts.values, marker='o', color='b')
    plt.title('Number of Queries Categorised by Action to Be Taken on the Query')
    plt.xlabel('Verb')
    plt.ylabel('Number of Queries')
    plt.xticks(rotation=45)
    plt.grid(True)

    # Show the plot
    plt.tight_layout()
    plt.show()

def convert_objects_to_numbers(df):

    # Step 1: Identify columns with object data type
    object_columns = df.select_dtypes(include=['object']).columns

    # Step 2: Convert each object column to numerical values
    for col in object_columns:
        unique_values = df[col].unique()
        mapping = {value: idx + 1 for idx, value in enumerate(unique_values)}
        df[col] = df[col].map(mapping).astype(np.float64)

    # Fill missing values with 0
    df.fillna(0, inplace=True)
    # df.to_csv("/Users/anas/Documents/UoR/MSc Project/Report/Logs/output2.csv", sep=',', encoding='utf-8', index=False)

    return df

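# Note: this is a plain label encoding -- the integer assigned to each value
# depends on its order of first appearance, so the resulting numbers carry no
# ordinal meaning. For models sensitive to magnitude, one-hot encoding
# (e.g. pd.get_dummies) may be the safer choice.
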
def feature_correlation(df):

    df1 = df.copy()

    # Convert date columns to datetime objects
    df1['$startdate'] = pd.to_datetime(df['$startdate'])
    df1['$starttime'] = pd.to_datetime(df['$starttime'])

    correlation_matrix = df1.corr()

    # Print label pairs whose absolute correlation is greater than 0.8
    high_correlation_labels = []
    for col in correlation_matrix.columns:
        correlated_cols = correlation_matrix.index[(correlation_matrix[col] > 0.8) | (correlation_matrix[col] < -0.8)].tolist()
        if len(correlated_cols) == 0:
            continue  # nothing correlated with this column; move on to the next
        correlated_cols.remove(col)  # Remove the column itself from the list
        for correlated_col in correlated_cols:
            if (col, correlated_col) not in high_correlation_labels and (correlated_col, col) not in high_correlation_labels:
                high_correlation_labels.append((col, correlated_col))
                print(f"Correlation > 0.8: {col} - {correlated_col} - {correlation_matrix.loc[col, correlated_col]}")

    # Mask for high-magnitude correlations (|r| > 0.8 covers strong negatives too)
    mask_high = np.abs(correlation_matrix) > 0.8

    # Correlation matrix with the non-masked values blanked out
    correlation_matrix_high = np.where(mask_high, correlation_matrix, np.nan)

    # Plot the heatmap for high correlations
    plt.figure(figsize=(8, 6))
    sns.heatmap(correlation_matrix_high, annot=False, cmap='coolwarm', center=0)
    plt.title('Correlation Heatmap (High Correlations)')
    plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=90)
    plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=0)
    plt.show()

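# Note: df.corr() only considers numeric columns, so converting '$startdate'
# and '$starttime' to datetime may exclude them from the matrix rather than
# include them; converting the datetime values to an integer representation
# (e.g. .astype('int64')) is one assumed workaround.
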
def outliers(df):

    plt.figure(figsize=(12, 8))
    sns.boxplot(data=df, orient='v')  # 'v' draws the box plots vertically
    plt.title('Box Plots of Features')
    plt.xlabel('Values')
    plt.show()

    # Calculate Z-scores for each feature
    z_scores = np.abs((df - df.mean()) / df.std())

    # Set a threshold for outlier detection (e.g., Z-score > 3)
    outlier_threshold = 3

    # Create a DataFrame of boolean values indicating outliers
    outliers = z_scores > outlier_threshold

    # Summarize which features have outliers
    features_with_outliers = outliers.any()

    print("Features with Outliers:", features_with_outliers[features_with_outliers == True].index)

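# Note: the Z-score computation assumes every column is numeric; on a frame
# with object or datetime columns the subtraction can raise a TypeError, so
# running convert_objects_to_numbers first (or selecting numeric columns via
# df.select_dtypes(include=[np.number])) is assumed here.
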

# Read a file into a data frame
def read_into_df(filename):
    return pd.read_csv(filename)


# Check duplicate values
def check_duplicates(df):

    duplicated_df = df.duplicated(keep=False)

    # Count the number of unique and duplicate rows
    num_unique = (~duplicated_df).sum()
    num_duplicates = duplicated_df.sum()

    # Create a bar plot
    plt.bar(['Unique', 'Duplicate'], [num_unique, num_duplicates])
    plt.xlabel('Row Type')
    plt.ylabel('Count')
    plt.title('Unique vs. Duplicate Rows')

    # Display the plot
    plt.show()

# Check null values
def check_null_values(df):
    missing_values = df.isnull().sum()
    total_rows = len(df)
    percentage_null_values = (missing_values / total_rows) * 100
    result = []
    for column, count in missing_values.items():
        percentage = percentage_null_values[column]
        result.append({'Column': column, 'Null Count': count, '% Null Values': percentage})

    result_df = pd.DataFrame(result)
    result_df.to_csv('/Users/anas/Documents/UoR/MSc Project/Report/Logs/Null Values.csv', index=False)
    return result_df

# Check unique values
def check_unique_values(df):
    result = []
    for column in df.columns:
        unique_values_count = df[column].nunique()
        unique_values = df[column].unique()
        result.append({'Column': column, 'Unique Values': unique_values_count,
                       'List of Unique Values': unique_values})

    result_df = pd.DataFrame(result)
    result_df.to_csv('/Users/anas/Documents/UoR/MSc Project/Report/Logs/Column Unique Values1.csv', index=False)
    return result_df


def check_missingvalues(df):

    # Find missing and non-null value counts in the dataframe
    # (df.count() counts non-null entries, not all rows)
    missing_values = df.isnull().sum()
    non_null_values = df.count()

    # Split the columns into two groups
    num_columns = df.shape[1]
    half_num_columns = num_columns // 2
    first_half_columns = df.columns[:half_num_columns]
    second_half_columns = df.columns[half_num_columns:]

    # Create subplots
    fig, axes = plt.subplots(2, 1, figsize=(15, 6))

    # Plot the first half of the columns
    axes[0].bar(non_null_values[first_half_columns].index, non_null_values[first_half_columns].values, color='blue', label='Non-null Values')
    axes[0].bar(missing_values[first_half_columns].index, missing_values[first_half_columns].values, color='orange', label='Missing Values')
    axes[0].set_xlabel('Features')
    axes[0].set_ylabel('Count')
    axes[0].set_title('Non-null Values vs Missing Values')
    axes[0].set_xticklabels(labels=first_half_columns, rotation=90)
    axes[0].legend()

    # Plot the second half of the columns
    axes[1].bar(non_null_values[second_half_columns].index, non_null_values[second_half_columns].values, color='blue', label='Non-null Values')
    axes[1].bar(missing_values[second_half_columns].index, missing_values[second_half_columns].values, color='orange', label='Missing Values')
    axes[1].set_xlabel('Features')
    axes[1].set_ylabel('Count')
    axes[1].set_title('Non-null Values vs Missing Values')
    axes[1].set_xticklabels(labels=second_half_columns, rotation=90)
    axes[1].legend()

    plt.tight_layout()
    plt.show()

    return


def list_unique_values(df, columns):
    result = []
    for column in columns:  # use the requested columns rather than all of df.columns
        unique_values = df[column].unique()
        result.append({'Column': column, 'List of Unique Values': unique_values})

    result_df = pd.DataFrame(result)
    result_df.to_csv('/Users/anas/Documents/UoR/MSc Project/Report/Logs/ListOfUniqueValues.csv', index=False)


if __name__ == '__main__':

    filename = "/Users/anas/Documents/UoR/MSc Project/Report/Logs/ConvertToLog.csv"

    start = time.time()
    chunks = pd.read_csv(filename, chunksize=1000000)
    end = time.time()
    print("Read csv with chunks: ", (end - start), "sec")

    start = time.time()
    pd_df = pd.concat(chunks)
    end = time.time()
    print("Concatenation time: ", (end - start), "sec")

    start = time.time()
    check_unique_values(pd_df)
    end = time.time()
    print(f'List Unique values time: {end - start} sec')
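# Note: pd.read_csv with chunksize returns an iterator almost immediately;
# the real parsing cost is paid when pd.concat consumes it, which is why the
# "Read csv with chunks" timing looks near-zero while the concatenation
# timing dominates.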
