custom-scripts/parsing.sql at main · jbdesbas/custom-scripts · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
CREATE OR REPLACE FUNCTION gn_imports.transform_ff_username(username text)
 RETURNS text
 LANGUAGE plpgsql
 IMMUTABLE
 STRICT
AS $function$
--JB Desbas, Picardie Nature. 2020-12-15. Permet de transformer les noms FF (Prénom Nom)
-- en noms "SINP" (NOM Prénom). Gèere aussi les "Machine et Machine Dupont" ou "Machin & Machine Dupont"
BEGIN
	RETURN (SELECT replace(btrim(concat_ws(' '::text, upper(x.o[2]), x.o[1])), 'auteur non diffusé'::text, 'ANONYME'::text) AS replace
           FROM ( SELECT regexp_matches(username, '^([^\s]+(?:(?:\set\s|\s?&\s?)[^\s]+)?)\s*(.*)$'::text) AS o) AS x);
END;
$function$
;


CREATE OR REPLACE FUNCTION remove_parentheses(_input_text TEXT, _typo text[] DEFAULT NULL )
 RETURNS text
 LANGUAGE plpgsql
 IMMUTABLE
AS $function$
--JB Desbas, Picardie Nature. 2021-09-07.
--Supprimer les parenthesis (), brackets [] et braces {} ainsi que leur contenu.
--_typo (text[]) : liste de delimiteurs à utiliser. Laisser NULL pour tous. Valeurs possibles : 'brackets','[]', 'braces', '{}',  'parentheses', '()'
--Pour des raisons techniques, la fonction nettoie aussi les doubles espaces en bout de chaine (trim) et à l'intérieur
--Supporte les imbrications jusqu'à 3 niveau (niveau1(niveau2(niveau3)))
DECLARE
  _text_output text;
  _invalids_args TEXT[];
  _typo_possible_values TEXT[] := ARRAY['brackets','[]','parentheses','()','braces','{}'];
BEGIN
    _text_output = _input_text;

    SELECT INTO _invalids_args array_agg(a.v) AS invalid_args FROM
        (SELECT UNNEST(_typo) AS v EXCEPT SELECT unnest(_typo_possible_values) AS v) a;
    IF _invalids_args IS NOT NULL
        THEN RAISE EXCEPTION 'Invalid arg : %' , _invalids_args
                USING HINT = 'please use only ' || _typo_possible_values::text;
    END IF;

    IF ARRAY['brackets','[]'] && _typo IS NOT FALSE
        THEN _text_output = regexp_replace(_text_output, '\[(?:[^\]\[]|\[(?:[^\]\[]|\[[^\]\[]*\])*\])*\]', ' ', 'g');
    END IF;
    IF ARRAY['parentheses','()'] && _typo IS NOT FALSE
        THEN _text_output = regexp_replace(_text_output,  '\((?:[^)(]|\((?:[^)(]|\([^)(]*\))*\))*\)' , ' ', 'g');
    END IF;
    IF ARRAY['braces','{}'] && _typo IS NOT FALSE
    THEN _text_output = regexp_replace(_text_output, '\{(?:[^\}\{]|\{(?:[^\}\{]|\{[^\}\{]*\})*\})*\}' , ' ', 'g');
    END IF;

    RETURN btrim(regexp_replace(_text_output, '\s+', ' ', 'g'));
END;
$function$
;

CREATE OR REPLACE FUNCTION entity2char(t text)
 RETURNS text
 LANGUAGE plpgsql
 IMMUTABLE
 STRICT
	AS $function$
	-- Jean-Baptiste DESBAS, Picardie Nature. 2021-11-16
	-- Permet de décoder les codes accent HTML (ex :  "&Agrave;"), présents au sein d'un texte, en charactères
	-- Adapted from https://stackoverflow.com/a/14985946/10995624

	declare
	    r record;
	    chars jsonb;
	BEGIN
	    chars = '[{"AElig" : "Æ"}, {"Aacute" : "Á"}, {"Acirc" : "Â"}, {"Agrave" : "À"}, {"Alpha" : "Α"}, {"Aring" : "Å"}, {"Atilde" : "Ã"}, {"Auml" : "Ä"}, {"Beta" : "Β"}, {"Ccedil" : "Ç"}, {"Chi" : "Χ"}, {"Dagger" : "‡"}, {"Delta" : "Δ"}, {"ETH" : "Ð"}, {"Eacute" : "É"}, {"Ecirc" : "Ê"}, {"Egrave" : "È"}, {"Epsilon" : "Ε"}, {"Eta" : "Η"}, {"Euml" : "Ë"}, {"Gamma" : "Γ"}, {"Iacute" : "Í"}, {"Icirc" : "Î"}, {"Igrave" : "Ì"}, {"Iota" : "Ι"}, {"Iuml" : "Ï"}, {"Kappa" : "Κ"}, {"Lambda" : "Λ"}, {"Mu" : "Μ"}, {"Ntilde" : "Ñ"}, {"Nu" : "Ν"}, {"OElig" : "Œ"}, {"Oacute" : "Ó"}, {"Ocirc" : "Ô"}, {"Ograve" : "Ò"}, {"Omega" : "Ω"}, {"Omicron" : "Ο"}, {"Oslash" : "Ø"}, {"Otilde" : "Õ"}, {"Ouml" : "Ö"}, {"Phi" : "Φ"}, {"Pi" : "Π"}, {"Prime" : "″"}, {"Psi" : "Ψ"}, {"Rho" : "Ρ"}, {"Scaron" : "Š"}, {"Sigma" : "Σ"}, {"THORN" : "Þ"}, {"Tau" : "Τ"}, {"Theta" : "Θ"}, {"Uacute" : "Ú"}, {"Ucirc" : "Û"}, {"Ugrave" : "Ù"}, {"Upsilon" : "Υ"}, {"Uuml" : "Ü"}, {"Xi" : "Ξ"}, {"Yacute" : "Ý"}, {"Yuml" : "Ÿ"}, {"Zeta" : "Ζ"}, {"aacute" : "á"}, {"acirc" : "â"}, {"acute" : "´"}, {"aelig" : "æ"}, {"agrave" : "à"}, {"alefsym" : "ℵ"}, {"alpha" : "α"}, {"amp" : "&"}, {"and" : "∧"}, {"ang" : "∠"}, {"aring" : "å"}, {"asymp" : "≈"}, {"atilde" : "ã"}, {"auml" : "ä"}, {"bdquo" : "„"}, {"beta" : "β"}, {"brvbar" : "¦"}, {"bull" : "•"}, {"cap" : "∩"}, {"ccedil" : "ç"}, {"cedil" : "¸"}, {"cent" : "¢"}, {"chi" : "χ"}, {"circ" : "ˆ"}, {"clubs" : "♣"}, {"cong" : "≅"}, {"copy" : "©"}, {"crarr" : "↵"}, {"cup" : "∪"}, {"curren" : "¤"}, {"dArr" : "⇓"}, {"dagger" : "†"}, {"darr" : "↓"}, {"deg" : "°"}, {"delta" : "δ"}, {"diams" : "♦"}, {"divide" : "÷"}, {"eacute" : "é"}, {"ecirc" : "ê"}, {"egrave" : "è"}, {"empty" : "∅"}, {"emsp" : " "}, {"ensp" : " "}, {"epsilon" : "ε"}, {"equiv" : "≡"}, {"eta" : "η"}, {"eth" : "ð"}, {"euml" : "ë"}, {"euro" : "€"}, {"exist" : "∃"}, {"fnof" : "ƒ"}, {"forall" : "∀"}, {"frac12" : "½"}, {"frac14" : "¼"}, {"frac34" : "¾"}, {"frasl" : "⁄"}, {"gamma" : "γ"}, {"ge" : "≥"}, {"gt" : ">"}, {"hArr" : "⇔"}, {"harr" : "↔"}, {"hearts" : "♥"}, {"hellip" : "…"}, {"iacute" : "í"}, {"icirc" : "î"}, {"iexcl" : "¡"}, {"igrave" : "ì"}, {"image" : "ℑ"}, {"infin" : "∞"}, {"int" : "∫"}, {"iota" : "ι"}, {"iquest" : "¿"}, {"isin" : "∈"}, {"iuml" : "ï"}, {"kappa" : "κ"}, {"lArr" : "⇐"}, {"lambda" : "λ"}, {"lang" : "〈"}, {"laquo" : "«"}, {"larr" : "←"}, {"lceil" : "⌈"}, {"ldquo" : "“"}, {"le" : "≤"}, {"lfloor" : "⌊"}, {"lowast" : "∗"}, {"loz" : "◊"}, {"lrm" : "‎"}, {"lsaquo" : "‹"}, {"lsquo" : "‘"}, {"lt" : "<"}, {"macr" : "¯"}, {"mdash" : "—"}, {"micro" : "µ"}, {"middot" : "·"}, {"minus" : "−"}, {"mu" : "μ"}, {"nabla" : "∇"}, {"nbsp" : " "}, {"ndash" : "–"}, {"ne" : "≠"}, {"ni" : "∋"}, {"not" : "¬"}, {"notin" : "∉"}, {"nsub" : "⊄"}, {"ntilde" : "ñ"}, {"nu" : "ν"}, {"oacute" : "ó"}, {"ocirc" : "ô"}, {"oelig" : "œ"}, {"ograve" : "ò"}, {"oline" : "‾"}, {"omega" : "ω"}, {"omicron" : "ο"}, {"oplus" : "⊕"}, {"or" : "∨"}, {"ordf" : "ª"}, {"ordm" : "º"}, {"oslash" : "ø"}, {"otilde" : "õ"}, {"otimes" : "⊗"}, {"ouml" : "ö"}, {"para" : "¶"}, {"part" : "∂"}, {"permil" : "‰"}, {"perp" : "⊥"}, {"phi" : "φ"}, {"pi" : "π"}, {"piv" : "ϖ"}, {"plusmn" : "±"}, {"pound" : "£"}, {"prime" : "′"}, {"prod" : "∏"}, {"prop" : "∝"}, {"psi" : "ψ"}, {"quot" : "\""}, {"rArr" : "⇒"}, {"radic" : "√"}, {"rang" : "〉"}, {"raquo" : "»"}, {"rarr" : "→"}, {"rceil" : "⌉"}, {"rdquo" : "”"}, {"real" : "ℜ"}, {"reg" : "®"}, {"rfloor" : "⌋"}, {"rho" : "ρ"}, {"rlm" : "‏"}, {"rsaquo" : "›"}, {"rsquo" : "’"}, {"sbquo" : "‚"}, {"scaron" : "š"}, {"sdot" : "⋅"}, {"sect" : "§"}, {"shy" : ""}, {"sigma" : "σ"}, {"sigmaf" : "ς"}, {"sim" : "∼"}, {"spades" : "♠"}, {"sub" : "⊂"}, {"sube" : "⊆"}, {"sum" : "∑"}, {"sup" : "⊃"}, {"sup1" : "¹"}, {"sup2" : "²"}, {"sup3" : "³"}, {"supe" : "⊇"}, {"szlig" : "ß"}, {"tau" : "τ"}, {"there4" : "∴"}, {"theta" : "θ"}, {"thetasym" : "ϑ"}, {"thinsp" : " "}, {"thorn" : "þ"}, {"tilde" : "˜"}, {"times" : "×"}, {"trade" : "™"}, {"uArr" : "⇑"}, {"uacute" : "ú"}, {"uarr" : "↑"}, {"ucirc" : "û"}, {"ugrave" : "ù"}, {"uml" : "¨"}, {"upsih" : "ϒ"}, {"upsilon" : "υ"}, {"uuml" : "ü"}, {"weierp" : "℘"}, {"xi" : "ξ"}, {"yacute" : "ý"}, {"yen" : "¥"}, {"yuml" : "ÿ"}, {"zeta" : "ζ"}, {"zwj" : "‍"}, {"zwnj" : "‌"}]'::jsonb;

	    for r in
		select distinct ce.ch, ce.name
		from
		    (SELECT (e)."key" AS "name", (e).value AS ch FROM ( SELECT jsonb_each_text(jsonb_array_elements(chars)) AS e ) a ) AS ce
		    inner join (
			select name[1] "name"
			from regexp_matches(t, '&([A-Za-z]+?);', 'g') r(name)
		    ) s on ce."name" = s.name
	    loop
		t := replace(t, '&' || r.name || ';', r.ch);
	    end loop;

	    for r in
		select distinct
		    hex[1] hex,
		    ('x' || repeat('0', 8 - length(hex[1])) || hex[1])::bit(32)::int codepoint
		from regexp_matches(t, '&#x([0-9a-f]{1,8}?);', 'gi') s(hex)
	    loop
		t := regexp_replace(t, '&#x' || r.hex || ';', chr(r.codepoint), 'gi');
	    end loop;

	    for r in
		select distinct
		    chr(codepoint[1]::int) ch,
		    codepoint[1] codepoint
		from regexp_matches(t, '&#([0-9]{1,10}?);', 'g') s(codepoint)
	    loop
		t := replace(t, '&#' || r.codepoint || ';', r.ch);
	    end loop;

	    return t;
	end;
	$function$
;


CREATE OR REPLACE FUNCTION orcid_is_valid(_string text)
 RETURNS boolean
 LANGUAGE plpgsql
 IMMUTABLE
 STRICT
AS $function$
DECLARE el TEXT;
DECLARE checksum int;
DECLARE checksum_s TEXT;
BEGIN
    checksum = 0;
    IF _string ~ '^[0-9]{15}[0-9|X]$' IS NOT TRUE THEN
        RAISE NOTICE '% is not a valid ORCID. Must be a 16 digits number', _string;
        RETURN FALSE;
    END IF;

    /**Checksum calculation**/
    FOREACH el IN ARRAY regexp_split_to_array(LEFT(_string,15), '')
    LOOP
        checksum = (el::int + checksum)*2;
    END LOOP;
    checksum = 12 - (checksum % 11) % 11;
    IF checksum = 10 THEN checksum_s = 'X'; --MAP 10 TO X
    ELSE checksum_s = checksum::int;
    END IF;

    IF checksum_s != right(_string,1) THEN
        RAISE NOTICE 'Invalid checksum for ORCID %',_string;
        RETURN FALSE;
    END IF;
    RETURN TRUE;
END;
$function$;
--CREATE DOMAIN orcid AS text CHECK(orcid_is_valid(VALUE));


CREATE OR REPLACE FUNCTION int_or_null(_myinput text)
    RETURNS int
    LANGUAGE SQL
    IMMUTABLE
    STRICT
AS $function$
    --Jean-Baptiste DESBAS. 2022-04-07
    --Retourne un INTEGER celui-ci peut être casté, sinon retourne NULL.
    SELECT CASE WHEN (btrim(_myinput::TEXT) ~ '^\d+$'::text) IS NOT TRUE THEN NULL::integer ELSE _myinput::integer END;
$function$;


CREATE OR REPLACE FUNCTION uuid_or_null(str text)
RETURNS uuid
LANGUAGE plpgsql
IMMUTABLE
STRICT
AS $$
   --https://stackoverflow.com/a/46433640/10995624
   --Add IMMUTABLE STRICT
BEGIN
  RETURN str::uuid;
EXCEPTION WHEN invalid_text_representation THEN
  RETURN NULL;
END;
$$;