JobAds/3. JobAnalysis.sql at main · ClaudioPoli/JobAds · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
--Lista del salario medio per ogni area per la professione più richiesta
CREATE VIEW avgSalaries as
SELECT m.Area,
       cast(AVG(sl.payperc90 + sl.payperc10 / 2) as decimal(10, 3)) as SalaryAverage,
       mode() within group (order by (s.header).jobtitle)           as job
from section as s
         join mapAreas as m
              on st_within(s.position, m.loc)
         join salarieslist sl on (s.salaries).salary = sl.listingid
group by m.Area
order by SalaryAverage desc;

SELECT Area, SalaryAverage, job as bestJob
FROM avgSalaries;

--Lista del salario medio per ogni area rispetto ad una professione specifica
--Professioni: Data Engineer,Project Manager,Data Scientist,Business Analyst
SELECT m.Area, cast(AVG(sl.payperc90 + sl.payperc10 / 2) as decimal(10, 3)) as SalaryAnnualAverage
from section as s
         join mapAreas as m
              on st_within(s.position, m.loc)
         join salarieslist sl on (s.salaries).salary = sl.listingid
where (s.header).jobtitle LIKE '%Data Scientist%'
group by m.Area
order by SalaryAnnualAverage desc;

--Aree confinanti che condividono la ricerca di una posizione lavorativa
CREATE VIEW relJobAreas AS
(
SELECT row_number() over (order by m.Area)                as id,
       m.Area                                             as Area1,
       mm.Area                                            as Area2,
       mode() within group (order by (s.header).jobtitle) as job
from section as s
         join mapAreas as m
              on st_within(s.position, m.loc)
         join mapAreas as mm
              on st_within(s.position, mm.loc)
where m.Area <> mm.Area
  and st_intersects(m.loc, mm.loc) = true
group by m.Area, mm.Area);

select a.Area1,a.Area2,a.job
from relJobAreas a,
     reljobareas b
WHERE a.id < b.id
  AND a.job = b.job;

--Trasformazione delle jobDescriptions testuali
select (s.header).jobtitle,(s.job).jobdescription,
       regexp_replace(regexp_replace((s.job).jobdescription, '\<.*?\>', '', 'g'), '\n', '', 'g')
from section s
         join mapAreas as m
              on st_within(s.position, m.loc)
         join avgsalaries a on m.area = a.Area
where (s.header).jobtitle = (select a.job from avgSalaries a limit 1)
  and m.area = (select a.Area from avgSalaries a limit 1);

CREATE FUNCTION descr_to_txt(n integer, job text, area text) RETURNS VOID
AS
$$
DECLARE
    count int := 0;
BEGIN
    WHILE count <> n
        LOOP
            EXECUTE format(
                    'COPY (select regexp_replace(regexp_replace((s.job).jobdescription,''\<.*?\>'','' '',''g''),''\n'','' '',''g'') from section s join mapAreas as m on st_within(s.position,m.loc) join avgsalaries a on m.area=a.Area where (s.header).jobtitle=%L and m.area=%L offset %L limit 1) to %L',
                    job, area, count, 'C:\Users\Public\JobDesctiption\JobDescr' || count || '.txt');
            count := count + 1;
        END LOOP;
    RETURN;
END
$$
    LANGUAGE plpgsql;


SELECT descr_to_txt((select cast(count(*) as int)
                     from section s
                              join mapAreas as m
                                   on st_within(s.position, m.loc)
                              join avgsalaries a on m.area = a.Area
                     where (s.header).jobtitle = (select a.job from avgSalaries a limit 1)
                       and m.area = (select a.Area from avgSalaries a limit 1)),
                    (select a.job from avgSalaries a limit 1), (select a.Area from avgSalaries a limit 1));


--Load dei dati dai csv esportati da GATE
COPY TfIdftable (term, typology, lang, tfidf, tfidfraw, tf, locdocfr, refdocfr) FROM 'C:\Users\Public\JobDesctiption\TfIdf.csv' DELIMITER ',' CSV HEADER;
COPY Annotationtable (term, typology, lang, tfidfaug, tfidfaugra, tf, locdocfr) FROM 'C:\Users\Public\JobDesctiption\Annotation.csv' DELIMITER ',' CSV HEADER;
COPY Hyponymytable (term, typology, lang, kdomRel, kdomRelra, tf, hypCount, locdocfr) FROM 'C:\Users\Public\JobDesctiption\Hyponymy.csv' DELIMITER ',' CSV HEADER;

INSERT INTO jobdescription(tfidf, annotation, hyponymy)
SELECT cast((t.term, t.lang, t.typology, t.tfidf, t.tfidfraw, t.tf, t.locdocfr, t.refdocfr) as TfIdf),
       cast((a.term, a.lang, a.typology, a.tfidfaug, a.tfidfaugra, a.tf, a.locdocfr) as Annotation),
       cast((h.term, h.lang, h.typology, h.kdomRel, h.kdomRelra, h.tf, h.hypCount, h.locdocfr) as Hyponymy)
FROM tfidftable t
         join annotationtable a on a.term = t.term
         join hyponymytable h on h.term = t.term;

--Termine e score associati
select (j.tfidf).term,(j.tfidf).tfidf,(j.annotation).tfidfaug,(j.tfidf).tf
from jobdescription j
where (j.tfidf).typology='MultiWord'
order by tfidfaug desc;

truncate table tfidftable;
truncate table annotationtable;
truncate table hyponymytable;