-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparsing.sql
More file actions
172 lines (150 loc) · 9.66 KB
/
parsing.sql
File metadata and controls
172 lines (150 loc) · 9.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
CREATE OR REPLACE FUNCTION gn_imports.transform_ff_username(username text)
RETURNS text
LANGUAGE plpgsql
IMMUTABLE
STRICT
AS $function$
--JB Desbas, Picardie Nature. 2020-12-15. Permet de transformer les noms FF (Prénom Nom)
-- en noms "SINP" (NOM Prénom). Gèere aussi les "Machine et Machine Dupont" ou "Machin & Machine Dupont"
BEGIN
RETURN (SELECT replace(btrim(concat_ws(' '::text, upper(x.o[2]), x.o[1])), 'auteur non diffusé'::text, 'ANONYME'::text) AS replace
FROM ( SELECT regexp_matches(username, '^([^\s]+(?:(?:\set\s|\s?&\s?)[^\s]+)?)\s*(.*)$'::text) AS o) AS x);
END;
$function$
;
CREATE OR REPLACE FUNCTION remove_parentheses(_input_text TEXT, _typo text[] DEFAULT NULL )
RETURNS text
LANGUAGE plpgsql
IMMUTABLE
AS $function$
--JB Desbas, Picardie Nature. 2021-09-07.
--Supprimer les parenthesis (), brackets [] et braces {} ainsi que leur contenu.
--_typo (text[]) : liste de delimiteurs à utiliser. Laisser NULL pour tous. Valeurs possibles : 'brackets','[]', 'braces', '{}', 'parentheses', '()'
--Pour des raisons techniques, la fonction nettoie aussi les doubles espaces en bout de chaine (trim) et à l'intérieur
--Supporte les imbrications jusqu'à 3 niveau (niveau1(niveau2(niveau3)))
DECLARE
_text_output text;
_invalids_args TEXT[];
_typo_possible_values TEXT[] := ARRAY['brackets','[]','parentheses','()','braces','{}'];
BEGIN
_text_output = _input_text;
SELECT INTO _invalids_args array_agg(a.v) AS invalid_args FROM
(SELECT UNNEST(_typo) AS v EXCEPT SELECT unnest(_typo_possible_values) AS v) a;
IF _invalids_args IS NOT NULL
THEN RAISE EXCEPTION 'Invalid arg : %' , _invalids_args
USING HINT = 'please use only ' || _typo_possible_values::text;
END IF;
IF ARRAY['brackets','[]'] && _typo IS NOT FALSE
THEN _text_output = regexp_replace(_text_output, '\[(?:[^\]\[]|\[(?:[^\]\[]|\[[^\]\[]*\])*\])*\]', ' ', 'g');
END IF;
IF ARRAY['parentheses','()'] && _typo IS NOT FALSE
THEN _text_output = regexp_replace(_text_output, '\((?:[^)(]|\((?:[^)(]|\([^)(]*\))*\))*\)' , ' ', 'g');
END IF;
IF ARRAY['braces','{}'] && _typo IS NOT FALSE
THEN _text_output = regexp_replace(_text_output, '\{(?:[^\}\{]|\{(?:[^\}\{]|\{[^\}\{]*\})*\})*\}' , ' ', 'g');
END IF;
RETURN btrim(regexp_replace(_text_output, '\s+', ' ', 'g'));
END;
$function$
;
CREATE OR REPLACE FUNCTION entity2char(t text)
RETURNS text
LANGUAGE plpgsql
IMMUTABLE
STRICT
AS $function$
-- Jean-Baptiste DESBAS, Picardie Nature. 2021-11-16
-- Permet de décoder les codes accent HTML (ex : "À"), présents au sein d'un texte, en charactères
-- Adapted from https://stackoverflow.com/a/14985946/10995624
declare
r record;
chars jsonb;
BEGIN
chars = '[{"AElig" : "Æ"}, {"Aacute" : "Á"}, {"Acirc" : "Â"}, {"Agrave" : "À"}, {"Alpha" : "Α"}, {"Aring" : "Å"}, {"Atilde" : "Ã"}, {"Auml" : "Ä"}, {"Beta" : "Β"}, {"Ccedil" : "Ç"}, {"Chi" : "Χ"}, {"Dagger" : "‡"}, {"Delta" : "Δ"}, {"ETH" : "Ð"}, {"Eacute" : "É"}, {"Ecirc" : "Ê"}, {"Egrave" : "È"}, {"Epsilon" : "Ε"}, {"Eta" : "Η"}, {"Euml" : "Ë"}, {"Gamma" : "Γ"}, {"Iacute" : "Í"}, {"Icirc" : "Î"}, {"Igrave" : "Ì"}, {"Iota" : "Ι"}, {"Iuml" : "Ï"}, {"Kappa" : "Κ"}, {"Lambda" : "Λ"}, {"Mu" : "Μ"}, {"Ntilde" : "Ñ"}, {"Nu" : "Ν"}, {"OElig" : "Œ"}, {"Oacute" : "Ó"}, {"Ocirc" : "Ô"}, {"Ograve" : "Ò"}, {"Omega" : "Ω"}, {"Omicron" : "Ο"}, {"Oslash" : "Ø"}, {"Otilde" : "Õ"}, {"Ouml" : "Ö"}, {"Phi" : "Φ"}, {"Pi" : "Π"}, {"Prime" : "″"}, {"Psi" : "Ψ"}, {"Rho" : "Ρ"}, {"Scaron" : "Š"}, {"Sigma" : "Σ"}, {"THORN" : "Þ"}, {"Tau" : "Τ"}, {"Theta" : "Θ"}, {"Uacute" : "Ú"}, {"Ucirc" : "Û"}, {"Ugrave" : "Ù"}, {"Upsilon" : "Υ"}, {"Uuml" : "Ü"}, {"Xi" : "Ξ"}, {"Yacute" : "Ý"}, {"Yuml" : "Ÿ"}, {"Zeta" : "Ζ"}, {"aacute" : "á"}, {"acirc" : "â"}, {"acute" : "´"}, {"aelig" : "æ"}, {"agrave" : "à"}, {"alefsym" : "ℵ"}, {"alpha" : "α"}, {"amp" : "&"}, {"and" : "∧"}, {"ang" : "∠"}, {"aring" : "å"}, {"asymp" : "≈"}, {"atilde" : "ã"}, {"auml" : "ä"}, {"bdquo" : "„"}, {"beta" : "β"}, {"brvbar" : "¦"}, {"bull" : "•"}, {"cap" : "∩"}, {"ccedil" : "ç"}, {"cedil" : "¸"}, {"cent" : "¢"}, {"chi" : "χ"}, {"circ" : "ˆ"}, {"clubs" : "♣"}, {"cong" : "≅"}, {"copy" : "©"}, {"crarr" : "↵"}, {"cup" : "∪"}, {"curren" : "¤"}, {"dArr" : "⇓"}, {"dagger" : "†"}, {"darr" : "↓"}, {"deg" : "°"}, {"delta" : "δ"}, {"diams" : "♦"}, {"divide" : "÷"}, {"eacute" : "é"}, {"ecirc" : "ê"}, {"egrave" : "è"}, {"empty" : "∅"}, {"emsp" : " "}, {"ensp" : " "}, {"epsilon" : "ε"}, {"equiv" : "≡"}, {"eta" : "η"}, {"eth" : "ð"}, {"euml" : "ë"}, {"euro" : "€"}, {"exist" : "∃"}, {"fnof" : "ƒ"}, {"forall" : "∀"}, {"frac12" : "½"}, {"frac14" : "¼"}, {"frac34" : "¾"}, {"frasl" : "⁄"}, {"gamma" : "γ"}, {"ge" : "≥"}, {"gt" : ">"}, {"hArr" : "⇔"}, {"harr" : "↔"}, {"hearts" : "♥"}, {"hellip" : "…"}, {"iacute" : "í"}, {"icirc" : "î"}, {"iexcl" : "¡"}, {"igrave" : "ì"}, {"image" : "ℑ"}, {"infin" : "∞"}, {"int" : "∫"}, {"iota" : "ι"}, {"iquest" : "¿"}, {"isin" : "∈"}, {"iuml" : "ï"}, {"kappa" : "κ"}, {"lArr" : "⇐"}, {"lambda" : "λ"}, {"lang" : "〈"}, {"laquo" : "«"}, {"larr" : "←"}, {"lceil" : "⌈"}, {"ldquo" : "“"}, {"le" : "≤"}, {"lfloor" : "⌊"}, {"lowast" : "∗"}, {"loz" : "◊"}, {"lrm" : ""}, {"lsaquo" : "‹"}, {"lsquo" : "‘"}, {"lt" : "<"}, {"macr" : "¯"}, {"mdash" : "—"}, {"micro" : "µ"}, {"middot" : "·"}, {"minus" : "−"}, {"mu" : "μ"}, {"nabla" : "∇"}, {"nbsp" : " "}, {"ndash" : "–"}, {"ne" : "≠"}, {"ni" : "∋"}, {"not" : "¬"}, {"notin" : "∉"}, {"nsub" : "⊄"}, {"ntilde" : "ñ"}, {"nu" : "ν"}, {"oacute" : "ó"}, {"ocirc" : "ô"}, {"oelig" : "œ"}, {"ograve" : "ò"}, {"oline" : "‾"}, {"omega" : "ω"}, {"omicron" : "ο"}, {"oplus" : "⊕"}, {"or" : "∨"}, {"ordf" : "ª"}, {"ordm" : "º"}, {"oslash" : "ø"}, {"otilde" : "õ"}, {"otimes" : "⊗"}, {"ouml" : "ö"}, {"para" : "¶"}, {"part" : "∂"}, {"permil" : "‰"}, {"perp" : "⊥"}, {"phi" : "φ"}, {"pi" : "π"}, {"piv" : "ϖ"}, {"plusmn" : "±"}, {"pound" : "£"}, {"prime" : "′"}, {"prod" : "∏"}, {"prop" : "∝"}, {"psi" : "ψ"}, {"quot" : "\""}, {"rArr" : "⇒"}, {"radic" : "√"}, {"rang" : "〉"}, {"raquo" : "»"}, {"rarr" : "→"}, {"rceil" : "⌉"}, {"rdquo" : "”"}, {"real" : "ℜ"}, {"reg" : "®"}, {"rfloor" : "⌋"}, {"rho" : "ρ"}, {"rlm" : ""}, {"rsaquo" : "›"}, {"rsquo" : "’"}, {"sbquo" : "‚"}, {"scaron" : "š"}, {"sdot" : "⋅"}, {"sect" : "§"}, {"shy" : ""}, {"sigma" : "σ"}, {"sigmaf" : "ς"}, {"sim" : "∼"}, {"spades" : "♠"}, {"sub" : "⊂"}, {"sube" : "⊆"}, {"sum" : "∑"}, {"sup" : "⊃"}, {"sup1" : "¹"}, {"sup2" : "²"}, {"sup3" : "³"}, {"supe" : "⊇"}, {"szlig" : "ß"}, {"tau" : "τ"}, {"there4" : "∴"}, {"theta" : "θ"}, {"thetasym" : "ϑ"}, {"thinsp" : " "}, {"thorn" : "þ"}, {"tilde" : "˜"}, {"times" : "×"}, {"trade" : "™"}, {"uArr" : "⇑"}, {"uacute" : "ú"}, {"uarr" : "↑"}, {"ucirc" : "û"}, {"ugrave" : "ù"}, {"uml" : "¨"}, {"upsih" : "ϒ"}, {"upsilon" : "υ"}, {"uuml" : "ü"}, {"weierp" : "℘"}, {"xi" : "ξ"}, {"yacute" : "ý"}, {"yen" : "¥"}, {"yuml" : "ÿ"}, {"zeta" : "ζ"}, {"zwj" : ""}, {"zwnj" : ""}]'::jsonb;
for r in
select distinct ce.ch, ce.name
from
(SELECT (e)."key" AS "name", (e).value AS ch FROM ( SELECT jsonb_each_text(jsonb_array_elements(chars)) AS e ) a ) AS ce
inner join (
select name[1] "name"
from regexp_matches(t, '&([A-Za-z]+?);', 'g') r(name)
) s on ce."name" = s.name
loop
t := replace(t, '&' || r.name || ';', r.ch);
end loop;
for r in
select distinct
hex[1] hex,
('x' || repeat('0', 8 - length(hex[1])) || hex[1])::bit(32)::int codepoint
from regexp_matches(t, '&#x([0-9a-f]{1,8}?);', 'gi') s(hex)
loop
t := regexp_replace(t, '&#x' || r.hex || ';', chr(r.codepoint), 'gi');
end loop;
for r in
select distinct
chr(codepoint[1]::int) ch,
codepoint[1] codepoint
from regexp_matches(t, '&#([0-9]{1,10}?);', 'g') s(codepoint)
loop
t := replace(t, '&#' || r.codepoint || ';', r.ch);
end loop;
return t;
end;
$function$
;
CREATE OR REPLACE FUNCTION orcid_is_valid(_string text)
RETURNS boolean
LANGUAGE plpgsql
IMMUTABLE
STRICT
AS $function$
DECLARE el TEXT;
DECLARE checksum int;
DECLARE checksum_s TEXT;
BEGIN
checksum = 0;
IF _string ~ '^[0-9]{15}[0-9|X]$' IS NOT TRUE THEN
RAISE NOTICE '% is not a valid ORCID. Must be a 16 digits number', _string;
RETURN FALSE;
END IF;
/**Checksum calculation**/
FOREACH el IN ARRAY regexp_split_to_array(LEFT(_string,15), '')
LOOP
checksum = (el::int + checksum)*2;
END LOOP;
checksum = 12 - (checksum % 11) % 11;
IF checksum = 10 THEN checksum_s = 'X'; --MAP 10 TO X
ELSE checksum_s = checksum::int;
END IF;
IF checksum_s != right(_string,1) THEN
RAISE NOTICE 'Invalid checksum for ORCID %',_string;
RETURN FALSE;
END IF;
RETURN TRUE;
END;
$function$;
--CREATE DOMAIN orcid AS text CHECK(orcid_is_valid(VALUE));
CREATE OR REPLACE FUNCTION int_or_null(_myinput text)
RETURNS int
LANGUAGE SQL
IMMUTABLE
STRICT
AS $function$
--Jean-Baptiste DESBAS. 2022-04-07
--Retourne un INTEGER celui-ci peut être casté, sinon retourne NULL.
SELECT CASE WHEN (btrim(_myinput::TEXT) ~ '^\d+$'::text) IS NOT TRUE THEN NULL::integer ELSE _myinput::integer END;
$function$;
CREATE OR REPLACE FUNCTION uuid_or_null(str text)
RETURNS uuid
LANGUAGE plpgsql
IMMUTABLE
STRICT
AS $$
--https://stackoverflow.com/a/46433640/10995624
--Add IMMUTABLE STRICT
BEGIN
RETURN str::uuid;
EXCEPTION WHEN invalid_text_representation THEN
RETURN NULL;
END;
$$;