-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathuserDatasetCreator
More file actions
100 lines (78 loc) · 2.83 KB
/
userDatasetCreator
File metadata and controls
100 lines (78 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import random
from faker import Faker
from pathlib import Path
import pandas as pd
# Input CSVs are read from the "Dataset" directory, relative to the current
# working directory. The paths are built with pathlib so the separator is
# correct on every platform; a plain "Dataset\..." literal contains the
# invalid escape sequences "\Q" and "\A" and only resolves on Windows.
cleanQDataset = pd.read_csv(
    Path("Dataset", "Questions_cleaned.csv"), encoding="ISO-8859-1"
)
aDataset = pd.read_csv(Path("Dataset", "Answers.csv"), encoding="ISO-8859-1")

# Pools used by generateEmail: candidate mail domains and the separator
# character placed between the first and last name.
domains = ["gmail.com", "hotmail.com", "livemillennium.com", "millenium.com"]
inBetween = ['_', '-', '.']

# Set of user ids that will receive a generated profile; addIds() adds to it.
# NOTE(review): the seed value is a float while addIds() inserts ints, which
# presumably makes the resulting 'Ids' column float-typed - confirm that this
# is intended before changing it.
validIds = {61.0}
def generateEmail(firstName: str, lastName: str):
    """Build an e-mail address of the form ``<first><sep><last>@<domain>``.

    The domain is picked at random from the module-level ``domains`` list and
    the separator from ``inBetween``; both picks use the global ``random``
    generator.
    """
    # The domain is drawn first and the separator second.
    domain = random.choice(domains)
    separator = random.choice(inBetween)
    return f'{firstName}{separator}{lastName}@{domain}'
def createUserDataset():
    """Assemble the generated user attributes into one DataFrame.

    Reads the module-level ``validIds`` set and the ``firstNames``,
    ``lastNames``, ``emails``, ``jobTitles``, ``addresses`` and
    ``activeStatus`` lists; each becomes one column, so they must all have
    the same length.

    Returns:
        A DataFrame with the columns ``Ids``, ``FirstName``, ``LastName``,
        ``Email``, ``JobTitle``, ``Address`` and ``Status``.
    """
    return pd.DataFrame({
        'Ids': list(validIds),
        'FirstName': firstNames,
        'LastName': lastNames,
        'Email': emails,
        'JobTitle': jobTitles,
        'Address': addresses,
        'Status': activeStatus,
    })
def addIds(dataframe: pd.DataFrame, column: str):
    """Add the values of ``column`` to the module-level ``validIds`` set.

    Rows where ``column`` is missing are skipped and the remaining values are
    cast to ``int`` before being added. ``dataframe`` itself is left
    unmodified.
    """
    idValues = dataframe.dropna(subset=[column])[column].astype('int')
    validIds.update(idValues.tolist())
def addAnsIds():
    """Register the ``OwnerUserId`` of every row in ``aDataset`` via addIds."""
    addIds(dataframe=aDataset, column='OwnerUserId')
def addQuesIds():
    """Register the ``OwnerUserId`` of every row in ``cleanQDataset`` via addIds."""
    addIds(dataframe=cleanQDataset, column='OwnerUserId')
# Answers that have an owner, with OwnerUserId cast to int. The original
# aDataset is left untouched.
newADataset = aDataset.dropna(subset=['OwnerUserId']).astype({'OwnerUserId': 'int'})

# Collect the ids of answer owners. addQuesIds() is not called here, so
# question owners are not added by this script.
addAnsIds()

# Number of user rows to generate: one per collected id.
size = len(validIds)
# Generate one fake profile per collected id. Each list below becomes a column
# in createUserDataset(), so they are all filled in lockstep.
fake = Faker()
firstNames, lastNames, emails = [], [], []
jobTitles, addresses, activeStatus = [], [], []

for _ in range(size):
    first = fake.first_name()
    last = fake.last_name()
    job = fake.job()
    address = f'{fake.city()}, {fake.country()}'

    firstNames.append(first)
    lastNames.append(last)
    # The e-mail is derived from the same name so the row stays consistent.
    emails.append(generateEmail(first, last))
    jobTitles.append(job)
    addresses.append(address)
    # Every generated user gets the same status.
    activeStatus.append("Active")
# One row per collected user id, filled with the generated attributes.
dataset = createUserDataset()

# Same rows plus a 'count' column holding how many rows share each id,
# ordered from the most to the least frequent id.
datasetCount = dataset.assign(
    count=dataset.groupby('Ids')['Ids'].transform('count')
).sort_values('count', ascending=False)
# Total answer score per user: sum Score over each OwnerUserId, order the
# result from highest to lowest total, and rename the key column to 'Ids' so
# it matches the user dataset.
ansScorer = aDataset.copy()
ansScored = (
    ansScorer.groupby(['OwnerUserId'])['Score']
    .sum()
    .reset_index()
    .sort_values('Score', ascending=False)
    .rename(columns={'OwnerUserId': 'Ids'})
)

# Attach the score totals to the generated users; the outer join keeps ids
# that appear on only one side.
fullDataset = dataset.merge(ansScored, on='Ids', how='outer')

# Write the combined table, creating the output directory when it is missing.
filepath = Path('Dataset/EngineersDataset.csv')
filepath.parent.mkdir(parents=True, exist_ok=True)
fullDataset.to_csv(filepath, index=False)