Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ If you want to run the test suite, make sure the test requirements are also inst

## Testing

Assuming you have installed the test requirements, you shoule be able to run
Assuming you have installed the test requirements, you should be able to run

python -m pytest --conf-file tests/appconfig.ini

Expand Down
11 changes: 6 additions & 5 deletions oz_tree_build/wiki_extraction/add_dates_and_species_to_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,12 @@ def process_interior_node_recursive_and_get_range(node):
# If the oldest child's date is later or the same as this node's date, adjust this node's date
# to be just after the oldest child's date
if oldest_child_from_date >= date_range[0]:
logging.warning(
f"Node '{taxon}' has from_date {node_data['from_date']}, "
f"but its child {child_with_oldest_from_date.taxon} "
f"has from_date {oldest_child_from_date}"
)
if oldest_child_from_date > date_range[0]:
logging.warning(
f"Node '{taxon}' has from_date {node_data['from_date']}, "
f"but its child {child_with_oldest_from_date.taxon} "
f"has from_date {oldest_child_from_date}"
)
date_range[0] = round(oldest_child_from_date + 0.001, 10)
else:
# If we don't have a range for this node, use the oldest child's date,
Expand Down
23 changes: 23 additions & 0 deletions oz_tree_build/wiki_extraction/mwparserfromhell_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,29 @@ def get_id_and_text_from_wiki_page(page_title):
return None, None


def get_qid_from_wiki_page_props(page_title, redirect):
    """Look up a page's Wikidata QID through the MediaWiki ``pageprops`` query.

    Doc: https://www.mediawiki.org/wiki/API:Pageprops

    Args:
        page_title: Title of the wiki page to query.
        redirect: When truthy, ``redirects=1`` is added to the query so the
            API resolves redirects before reporting the page props.

    Returns:
        The QID as an int with its first character dropped (presumably the
        leading "Q" of e.g. "Q42"), or None when the response does not
        contain a usable ``wikibase_item``.
    """
    params = {
        "action": "query",
        "prop": "pageprops",
        "titles": page_title,
        "format": "json",
        "formatversion": "2",
    }
    if redirect:
        params["redirects"] = "1"
    headers = {"User-Agent": "My-Bot-Name/1.0"}
    req = session.get(API_URL, headers=headers, params=params, allow_redirects=True)
    try:
        # Decoding happens inside the try so that a non-JSON body (e.g. an
        # HTML error page) is treated like any other "no QID" outcome.
        res = req.json()
        page = res["query"]["pages"][0]
        # .get() yields None when the page has props but no Wikidata item;
        # the resulting AttributeError on .strip() is handled below.
        qid_string = page["pageprops"].get("wikibase_item")
        return int(qid_string.strip()[1:])
    except (KeyError, IndexError, TypeError, AttributeError, ValueError):
        # This is a best-effort lookup: callers fall back to other QID sources,
        # so any malformed or incomplete response is reported as "not found".
        logging.info(f"Could not find QID from page props for page '{page_title}' (redirect={redirect})")
        return None


def get_wikicode_for_string(wiki_string) -> mwparserfromhell.wikicode.Wikicode:
    """Parse ``wiki_string`` with mwparserfromhell, skipping style tags, and return the result."""
    parsed = mwparserfromhell.parse(wiki_string, skip_style_tags=True)
    return parsed

Expand Down
1 change: 1 addition & 0 deletions oz_tree_build/wiki_extraction/period_date_ranges.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@
"burdigalian": [20.44, 15.97],
"middle miocene": [15.97, 11.63],
"langhian": [15.97, 13.82],
"barstovian": [16.3, 13.6],
"serravallian": [13.82, 11.63],
"late miocene": [11.63, 5.333],
"tortonian": [11.63, 7.246],
Expand Down
37 changes: 28 additions & 9 deletions oz_tree_build/wiki_extraction/wiki_taxon_page_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

from oz_tree_build.wiki_extraction.mwparserfromhell_helpers import (
get_display_string_from_wikicode,
get_qid_from_wiki_page_props,
get_taxon_name,
get_wikicode_for_page,
get_wikicode_template,
Expand Down Expand Up @@ -50,7 +51,7 @@ def get_date_range_from_taxobox(taxobox):
# Template name can randomly be "fossil range" or "geological range", with or without space/underscores
fossil_range_template = get_wikicode_template(
fossil_range,
("fossilrange", "geologicalrange", "geologicalrange/linked", "geologicalage"),
("fossilrange", "geologicalrange", "geologicalrange/linked", "geologicalage", "temporalrange"),
)

if not fossil_range_template:
Expand All @@ -62,16 +63,27 @@ def get_date_range_from_taxobox(taxobox):
from_date = get_range_date(range_string, use_start=True)
to_date = get_range_date(range_string, use_start=False)
else:
# If the first param is "earliest", we skip it.
# Skip params that look like "earliest"
# e.g. Stegosauria has {{fossilrange|earliest=174|169|100|latest=66}}
param_index = 0
if fossil_range_template.params[0].name == "earliest":
param_index = 1
# Note that they're not always the first one listed
def skip_earliest(idx):
if idx < len(fossil_range_template.params) and str(fossil_range_template.params[idx].name).startswith(
"earliest"
):
idx += 1
return idx

param_index = skip_earliest(0)

from_date = get_range_date(fossil_range_template.params[param_index].value, use_start=True)
param_index += 1

param_index = skip_earliest(param_index)

# If there is no end date, we fall back to the start date
to_date = (
get_range_date(fossil_range_template.params[param_index + 1].value, use_start=False)
if len(fossil_range_template.params) >= param_index + 2
get_range_date(fossil_range_template.params[param_index].value, use_start=False)
if len(fossil_range_template.params) >= param_index + 1
else from_date
)

Expand Down Expand Up @@ -199,8 +211,15 @@ def get_taxon_data_from_wikipedia_page(taxon, page_title, is_leaf):
if not from_date:
logging.warning(f"Could not find fossil range for {taxon}")

# Get the Wikidata QID, if any
qid = get_qid_from_wikicode(wikicode)
# 1. We first try to get the Wikidata QID from the page props, ignoring redirects. e.g. in a case like Averostra,
# which has no actual wikipedia page and redirects to Theropoda, this will give us the correct Averostra QID.
# 2. If that fails, we try to get the QID from the taxobox.
# 3. If that fails, we try to get the QID from the page props, following redirects.
qid = get_qid_from_wiki_page_props(page_title, False)
if not qid:
qid = get_qid_from_wikicode(wikicode)
if not qid:
qid = get_qid_from_wiki_page_props(page_title, True)
if qid:
node_data["qid"] = qid

Expand Down
7 changes: 6 additions & 1 deletion tests/test_format_newick.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@

import oz_tree_build.newick.format_newick as format_newick

test_tree = "(A,(BA,((BBAA_ott123[Comment 1],BBAB,BBAC,'BBAD (foo)')'BAA (bar)'[Comment 2 (Hello)],(BBBA)BBB,(BBCA:12.34,BBCB)BBC_ott456:78.9)BB)B_ott789,((CAA,CAB),CB)C,D)Root;"
test_tree = (
"(A,(BA,((BBAA_ott123[Comment 1],BBAB,BBAC,'BBAD (foo)')"
"'BAA (bar)'[Comment 2 (Hello)],(BBBA)BBB,"
"(BBCA:12.34,BBCB)BBC_ott456:78.9)BB)B_ott789,"
"((CAA,CAB),CB)C,D)Root;"
)

formatted_test_tree = """(
A,
Expand Down