'''
Created on 2022-03-4
@author: wf
'''
import unittest
from lodstorage.trulytabular import TrulyTabular, WikidataItem, WikidataProperty
from lodstorage.query import Query, QuerySyntaxHighlight, Endpoint
from lodstorage.sparql import SPARQL
from pprint import pprint
from urllib.error import HTTPError
class TestTrulyTabular(unittest.TestCase):
    '''
    test Truly tabular analysis

    Most tests run against every endpoint configured in setUp and therefore
    need network access to the public Wikidata SPARQL services.
    '''

    def setUp(self):
        '''
        prepare the debug flag and the endpoint configurations to test against
        '''
        self.debug = False
        qleverEndpoint = Endpoint()
        qleverEndpoint.name = "qlever-wikidata"
        qleverEndpoint.method = "POST"
        qleverEndpoint.database = "qlever"
        qleverEndpoint.endpoint = "https://qlever.cs.uni-freiburg.de/api/wikidata"
        self.endpointConfs = {qleverEndpoint, Endpoint.getDefault()}

    def handleServiceUnavailable(self, ex, endpointConf):
        '''
        handle service unavailable

        An exception whose text mentions "503" is reported on stdout and
        swallowed so that an unavailable endpoint does not fail the test run.

        Args:
            ex(Exception): the exception to handle
            endpointConf(Endpoint): the endpoint for which there is a problem

        Raises:
            Exception: ex itself if it is an assertion failure or
                does not mention "503"
        '''
        # a failed assertion is a genuine test failure even if its message
        # happens to contain the digits "503" (e.g. as part of a count)
        if isinstance(ex, AssertionError) or "503" not in str(ex):
            raise ex
        print(f"{endpointConf.name} at {endpointConf.endpoint} returns 503 Service Unavailable", flush=True)

    def tearDown(self):
        '''
        nothing to clean up
        '''
        pass

    def testGetFirst(self):
        '''
        test the get First helper function
        '''
        tt = TrulyTabular("Q2020153")
        testcases = [
            {
                "qlod": [{"name": "firstname"}],
                "expected": "firstname"
            },
            {
                "qlod": [],
                "expected": None
            },
            {
                "qlod": [{"name": "firstname"}, {"name": "second name"}],
                "expected": None
            }
        ]
        for testcase in testcases:
            qLod = testcase["qlod"]
            expected = testcase["expected"]
            try:
                value = tt.sparql.getFirst(qLod, "name")
            except Exception as ex:
                if self.debug:
                    print(str(ex))
                # getFirst may only fail for the cases without an expected value
                self.assertIsNone(expected)
            else:
                # compare outside of the try block so that a failed comparison
                # is not caught and masked by the exception handler above
                self.assertEqual(expected, value)

    def documentQuery(self, tt, query, show=True, formats=None):
        '''
        document the given query for the given TrulyTabular instance

        Args:
            tt(TrulyTabular): the instance whose sparql access is used to run the query
            query(Query): the query to run and document
            show(bool): if True print the documentation text
            formats(list): the table formats to document in - default: ["mediawiki"]
        '''
        if formats is None:
            formats = ["mediawiki"]
        tryItUrl = "https://query.wikidata.org/"
        qlod = tt.sparql.queryAsListOfDicts(query.query)
        for tablefmt in formats:
            doc = query.documentQueryResult(qlod, tablefmt=tablefmt, tryItUrl=tryItUrl, floatfmt=".0f")
            # the text is always rendered so that rendering problems
            # surface even when nothing is shown
            docstr = doc.asText()
            if show:
                print(docstr)

    def testGetPropertiesByLabel(self):
        '''
        try getting properties by label
        '''
        debug = self.debug
        # debug = True
        propertyLabels = ["title", "country", "location"]
        for endpointConf in self.endpointConfs:
            try:
                tt = TrulyTabular("Q2020153", propertyLabels=propertyLabels, endpointConf=endpointConf)
                if debug:
                    print(tt.properties)
                for prop in propertyLabels:
                    self.assertIn(prop, tt.properties)
            except Exception as ex:
                self.handleServiceUnavailable(ex, endpointConf)

    def testGetPropertiesById(self):
        '''
        try getting properties by id
        '''
        debug = self.debug
        # debug = True
        propertyIds = ["P1800"]
        expected = ["Wikimedia database name"]
        for endpointConf in self.endpointConfs:
            try:
                sparql = SPARQL(endpointConf.endpoint, method=endpointConf.method)
                propList = WikidataProperty.getPropertiesByIds(sparql, propertyIds, lang="en")
                for i, prop in enumerate(propList):
                    if debug:
                        print(f"{endpointConf.name} {i}:{prop}")
                    self.assertEqual(prop, expected[i])
            except Exception as ex:
                self.handleServiceUnavailable(ex, endpointConf)

    def testGetItemsByLabel(self):
        '''
        try getting items by label
        '''
        debug = self.debug
        # debug = True
        qLabels = ["academic conference", "scientific conference series", "whisky distillery", "human"]
        for endpointConf in self.endpointConfs:
            try:
                sparql = SPARQL(endpointConf.endpoint, method=endpointConf.method)
                items = {}
                for qLabel in qLabels:
                    items4Label = WikidataItem.getItemsByLabel(sparql, qLabel, debug=debug)
                    count = len(items4Label)
                    if debug:
                        print(f"found {count} items for label {qLabel}")
                    self.assertGreater(count, 0)
                    if debug:
                        for i, item in enumerate(items4Label):
                            print(f"{endpointConf.name} {i+1}:{item}")
                    items[qLabel] = items4Label[0]
                for qLabel in qLabels:
                    self.assertIn(qLabel, items)
            except Exception as ex:
                self.handleServiceUnavailable(ex, endpointConf)

    def testTrulyTabularTables(self):
        '''
        test Truly Tabular for different tabular queries
        '''
        debug = self.debug
        # debug = True
        show = False
        showStats = ["mediawiki", "github", "latex"]
        tables = [
            {
                "name": "computer scientist",
                "title": "humans with the occupation computer scientist",
                "qid": "Q5",  # human
                "where": "?item wdt:P106 wd:Q82594.",  # computer scientist only
                "propertyLabels": ["sex or gender", "date of birth", "place of birth", "field of work", "occupation", "ORCID iD",
                                   "GND ID", "DBLP author ID", "Google Scholar author ID", "VIAF ID"],
                "expected": 10
            },
            {
                "name": "academic conferences",
                "title": "academic conferences",
                "qid": "Q2020153",  # academic conference
                "propertyLabels": ["title", "country", "location", "short name", "start time",
                                   "end time", "part of the series", "official website", "described at URL",
                                   "WikiCFP event ID", "GND ID", "VIAF ID", "main subject", "language used",
                                   "is proceedings from"
                                   ],
                "expected": 7500
            },
            {
                "name": "scientific conferences series",
                "title": "scientific conference series",
                "qid": "Q47258130",  # scientific conference series
                "propertyLabels": ["title", "short name", "inception", "official website", "DBLP venue ID", "GND ID",
                                   "Microsoft Academic ID", "Freebase ID", "WikiCFP conference series ID",
                                   "Publons journals/conferences ID", "ACM conference ID"],
                "expected": 4200
            },
            {
                "name": "whisky distilleries",
                "title": "whisky distilleries",
                "qid": "Q10373548",  # whisky distillery
                "propertyLabels": ["inception", "official website", "owned by", "country", "headquarters location", "Whiskybase distillery ID"],
                "expected": 200
            }
        ]
        errors = 0
        # the slice restricts the run to the last table (whisky distilleries)
        for table in tables[3:]:
            where = table.get("where")
            tt = TrulyTabular(table["qid"], table["propertyLabels"], where=where, debug=debug)
            if "is proceedings from" in tt.properties:
                tt.properties["is proceedings from"].reverse = True
            count, query = tt.count()
            if debug:
                print(count)
            self.assertGreater(count, table["expected"])
            stats = tt.getPropertyStatistics()
            # sort descending by total percentage
            stats = sorted(stats, key=lambda row: row['total%'], reverse=True)
            for tablefmt in showStats:
                query = Query(name=table["name"], title=table["title"], query="")
                doc = query.documentQueryResult(stats, tablefmt=tablefmt, withSourceCode=False)
                if debug:
                    print(doc)
            if show:
                for wdProperty in tt.properties.values():
                    for asFrequency in [True, False]:
                        query = tt.noneTabularQuery(wdProperty, asFrequency=asFrequency)
                        try:
                            self.documentQuery(tt, query)
                        except Exception as ex:
                            # keep going so that all failing queries are reported
                            print(f"query for {wdProperty} failed\n{str(ex)}")
                            errors += 1
        self.assertEqual(0, errors)

    def testMostFrequentProperties(self):
        '''
        test getting the most frequent properties for some Wikidata Item types
        '''
        # show = True
        show = False
        debug = self.debug
        # debug = True
        for endpointConf in self.endpointConfs:
            for qid in ["Q6256"]:
                try:
                    tt = TrulyTabular(qid, debug=debug, endpointConf=endpointConf)
                    for minCount in [0, 100]:
                        query = tt.mostFrequentPropertiesQuery(minCount=minCount)
                        if debug:
                            print(query.query)
                        self.documentQuery(tt, query, formats=["github"], show=show)
                except Exception as ex:
                    self.handleServiceUnavailable(ex, endpointConf)

    def testSyntaxHighlighting(self):
        '''
        test the html syntax highlighting of a generated query, see
        https://github.com/WolfgangFahl/pyLoDStorage/issues/81
        '''
        debug = self.debug
        # debug = True
        qid = "Q6256"  # country
        tt = TrulyTabular(qid, debug=debug)
        query = tt.mostFrequentPropertiesQuery()
        sh = QuerySyntaxHighlight(query, "html")
        html = sh.highlight()
        if debug:
            print(html)
        self.assertIn('<span class="k">SELECT</span>', html)

    def testCount(self):
        '''
        test the count function of truly tabular
        '''
        debug = self.debug
        # debug = True
        qid = "Q55488"  # railway stations
        for endpointConf in self.endpointConfs:
            try:
                tt = TrulyTabular(qid, endpointConf=endpointConf, debug=debug)
                count, query = tt.count()
                if debug:
                    print(query)
                    print(f"count of railway stations is {count}")
                self.assertIn(qid, query)
                self.assertGreaterEqual(count, 106195)
                self.assertIsNone(tt.error)
            except Exception as ex:
                self.handleServiceUnavailable(ex, endpointConf)

    def testGenerateSparqlQuery(self):
        '''
        test Generating a SPARQL query
        '''
        configs = [
            {
                "naive": True,
                "qid": "Q2020153",  # academic conference
                "subclassPredicate": "wdt:P31",
                "propertyIdMap": {
                    "P1813": ["label"],
                    "P17": ["label"],
                    "P1476": ["label"]
                },
                "expected": []
            },
            {
                "naive": False,
                "qid": "Q2020153",  # academic conference
                "subclassPredicate": "wdt:P31",
                "propertyIdMap": {
                    "P1813": ["sample"],
                    "P17": ["sample"],
                    "P1476": ["sample"]
                },
                "expected": ["GROUP BY", "SAMPLE"]
            },
            {
                "naive": False,
                "qid": "Q2020153",  # academic conference
                "subclassPredicate": "wdt:P31",
                "propertyIdMap": {
                    "P1813": ["count", "list"],
                    "P17": ["sample", "ignore"],
                    "P1476": ["count", "list"]
                },
                "expected": ["COUNT (DISTINCT", "GROUP BY", "GROUP_CONCAT (DISTINCT", "HAVING"]
            },
            {
                "naive": False,
                "qid": "Q1667921",  # novel series
                "subclassPredicate": "wdt:P31",
                "propertyIdMap": {
                    "P50": ["sample", "ignore"],  # author
                    "P136": ["sample", "ignore"],  # genre
                    "P1476": ["sample", "ignore"]  # title
                },
                "expected": ["GROUP BY", "HAVING", "COUNT", "<=1"]
            },
            {
                "naive": False,
                "qid": "Q1667921",  # novel series
                "subclassPredicate": "wdt:P31",
                "propertyIdMap": {
                    "P50": ["sample", "ignore", "label"],  # author
                    "P136": ["sample", "ignore", "label"],  # genre
                    "P1476": ["sample", "ignore"]  # title
                },
                "expected": ["GROUP BY", "HAVING", "COUNT", "<=1"]
            },
            {
                "naive": False,
                "subclassPredicate": "wdt:P279*/wdt:P31*",
                "qid": "Q8063",  # rock
                "propertyIdMap": {
                    "P18": ["sample"],  # image
                },
                "expected": ["P279"]
            }
        ]
        debug = self.debug
        # debug = True
        # loop over different test configurations
        for i, config in enumerate(configs):
            # get the test configuration
            qid = config["qid"]
            naive = config["naive"]
            propertyIdMap = config["propertyIdMap"]
            subclassPredicate = config["subclassPredicate"]
            expectedList = config["expected"]
            # create a truly tabular analysis
            tt = TrulyTabular(qid, propertyIds=list(propertyIdMap), subclassPredicate=subclassPredicate)
            varname = tt.item.itemVarname
            # generate a SPARQL Query
            sparqlQuery = tt.generateSparqlQuery(genMap=propertyIdMap, naive=naive)
            if debug:
                print(f"config {i}:")
                pprint(config)
                print(f"{sparqlQuery}")
            # all queries should have basic graph patterns for the subclass
            self.assertIn(f"?{varname} {subclassPredicate} wd:{qid}.", sparqlQuery)
            # and for the properties
            for pid in propertyIdMap:
                self.assertIn(f"?{varname} wdt:{pid}", sparqlQuery)
            for expected in expectedList:
                self.assertIn(expected, sparqlQuery, f"config {i}:{expected} missing")
if __name__ == "__main__":
    # run all tests of this module when it is executed as a script;
    # to run a single test set e.g. sys.argv = ['', 'Test.testName'] beforehand
    unittest.main()