Source code for tests.testTrulyTabular

'''
Created on 2022-03-4

@author: wf
'''
import unittest
from lodstorage.trulytabular import TrulyTabular, WikidataItem, WikidataProperty
from lodstorage.query import Query, QuerySyntaxHighlight, Endpoint
from lodstorage.sparql import SPARQL
from pprint import pprint
from urllib.error import HTTPError

[docs]class TestTrulyTabular(unittest.TestCase):
    '''
    test Truly tabular analysis
    '''

[docs]    def setUp(self):
        self.debug=False
        qleverEndpoint=Endpoint()
        qleverEndpoint.name="qlever-wikidata"
        qleverEndpoint.method="POST"
        qleverEndpoint.database="qlever"
        qleverEndpoint.endpoint="https://qlever.cs.uni-freiburg.de/api/wikidata"
        self.endpointConfs={qleverEndpoint,Endpoint.getDefault()}
        pass
    
[docs]    def handleServiceUnavailable(self,ex,endpointConf):
        """
        handle service unavailable
        
        Args:
            ex(Exception): the exception to handle
            endpointConf(Endpoint): the endpoint for which there is a problem
        """
        if "503" in str(ex):
            print(f"{endpointConf.name} at {endpointConf.endpoint} returns 503 Service Unavailable",flush=True)
        else:
            raise(ex)

[docs]    def tearDown(self):
        pass

[docs]    def testGetFirst(self):
        '''
        test the get First helper function
        '''
        tt=TrulyTabular("Q2020153")
        testcases=[
            { 
                "qlod":[{"name":"firstname"}],
                "expected": "firstname"
            },
            {
                "qlod":[],
                "expected": None
            },
            {
                "qlod":[{"name":"firstname"},{"name":"second name"}],
                "expected": None
            }
        ]
        for testcase in testcases:
            qLod=testcase["qlod"]
            expected=testcase["expected"]
            try:
                value=tt.sparql.getFirst(qLod,"name")
                self.assertEqual(expected,value)
            except Exception as ex:
                if self.debug:
                    print(str(ex))
                self.assertIsNone(expected)
                
[docs]    def documentQuery(self,tt,query,show=True,formats=["mediawiki"]):
        '''
        document the given query for the given TrueTabular instance
        '''
        qlod=tt.sparql.queryAsListOfDicts(query.query)
        for tablefmt in formats:
            tryItUrl="https://query.wikidata.org/"
            doc=query.documentQueryResult(qlod, tablefmt=tablefmt,tryItUrl=tryItUrl,floatfmt=".0f")
            docstr=doc.asText()
            if show:
                print (docstr)
                
[docs]    def testGetPropertiesByLabel(self):
        '''
        try getting properties by label
        '''
        debug=self.debug
        #debug=True
        propertyLabels=["title","country","location"]
        for endpointConf in self.endpointConfs:
            try:
                tt=TrulyTabular("Q2020153",propertyLabels=propertyLabels,endpointConf=endpointConf)
                if debug:
                    print (tt.properties)
                for prop in propertyLabels:
                    self.assertTrue(prop in tt.properties)
            except (Exception,HTTPError) as ex: 
                    self.handleServiceUnavailable(ex,endpointConf)
                    pass
            
[docs]    def testGetPropertiesById(self):
        '''
        try getting properties by label
        '''
        debug=self.debug
        #debug=True
        propertyIds=["P1800"]
        expected=["Wikimedia database name"]
        for endpointConf in self.endpointConfs:
            try:
                sparql=SPARQL(endpointConf.endpoint,method=endpointConf.method)
                propList=WikidataProperty.getPropertiesByIds(sparql, propertyIds, lang="en")
                for i,prop in enumerate(propList):
                    if debug:
                        print(f"{endpointConf.name} {i}:{prop}")
                    self.assertEqual(prop,expected[i])
            except (Exception,HTTPError) as ex: 
                    self.handleServiceUnavailable(ex,endpointConf)
                    pass
                
[docs]    def testGetItemsByLabel(self):
        '''
        try getting items by label
        '''
        debug=self.debug
        debug=True
        qLabels=["academic conference","scientific conference series","whisky distillery","human"]
        for endpointConf in self.endpointConfs:
            try:
                sparql=SPARQL(endpointConf.endpoint,method=endpointConf.method)
                items={}
                for qLabel in qLabels:
                    items4Label=WikidataItem.getItemsByLabel(sparql, qLabel,debug=debug)
                    count=len(items4Label)
                    if debug:
                        print(f"found {count} items for label {qLabel}")
                    self.assertTrue(count>0)
                    for i,item in enumerate(items4Label):
                        if debug:
                            print(f"{endpointConf.name} {i+1}:{item}")
                    items[qLabel]=items4Label[0]
                for qLabel in qLabels:
                    self.assertTrue(qLabel in items)
            except (Exception,HTTPError) as ex: 
                    self.handleServiceUnavailable(ex,endpointConf)
                    pass

[docs]    def testTrulyTabularTables(self):
        '''
        test Truly Tabular for different tabular queries
        '''
        debug=self.debug
        #debug=True
        show=False
        showStats=["mediawiki","github","latex"]
        tables=[ 
            {
               "name": "computer scientist",
               "title": "humans with the occupation computer scientist",
               "qid":"Q5", # human
               "where": "?item wdt:P106 wd:Q82594.", # computer scientist only
               "propertyLabels": ["sex or gender","date of birth","place of birth","field of work","occupation","ORCID iD",
                                  "GND ID","DBLP author ID","Google Scholar author ID","VIAF ID"],
               "expected": 10  
            },
            {
               "name": "academic conferences",
               "title": "academic conferences",
               "qid": "Q2020153",# academic conference
               "propertyLabels":["title","country","location","short name","start time",
                "end time","part of the series","official website","described at URL",
                "WikiCFP event ID","GND ID","VIAF ID","main subject","language used",
                "is proceedings from"
               ],
               "expected": 7500
            },
            {
                "name": "scientific conferences series",
                "title": "scientific conference series",
                "qid": "Q47258130", # scientific conference series
                "propertyLabels":["title","short name","inception","official website","DBLP venue ID","GND ID",
                    "Microsoft Academic ID","Freebase ID","WikiCFP conference series ID",
                    "Publons journals/conferences ID","ACM conference ID"],
                "expected": 4200
            },
            {
                "name": "whisky distilleries",
                "title": "whisky distilleries",
                "qid": "Q10373548", # whisky distillery
                "propertyLabels":["inception","official website","owned by","country","headquarters location","Whiskybase distillery ID"],
                "expected": 200
            }
        ]
        errors=0
        for table in tables[3:]:
            # academic conference
            where=None
            if "where" in table:
                where=table["where"]
            tt=TrulyTabular(table["qid"],table["propertyLabels"],where=where,debug=debug)
            if "is proceedings from" in tt.properties:
                tt.properties["is proceedings from"].reverse=True
            count,query=tt.count()
            if (debug):
                print(count)
            self.assertTrue(count>table["expected"])
            stats=tt.getPropertyStatistics()
            # sort descending by total percentage
            stats = sorted(stats, key=lambda row: row['total%'],reverse=True) 
            for tablefmt in showStats:
                query=Query(name=table["name"],title=table["title"],query="")
                doc=query.documentQueryResult(stats, tablefmt=tablefmt, withSourceCode=False)
                if debug:
                    print(doc)
            if show:
                for wdProperty in tt.properties.values():
                    for asFrequency in [True,False]:
                        query=tt.noneTabularQuery(wdProperty,asFrequency=asFrequency)
                        try:
                            self.documentQuery(tt, query)
                        except Exception as ex:
                            print(f"query for {wdProperty} failed\n{str(ex)}")
                            errors+=1
                self.assertEqual(0,errors)
            
                
[docs]    def testMostFrequentProperties(self):
        '''
        test getting the most frequent properties for some Wikidata Item types
        '''
        #show=True
        show=False
        debug=self.debug
        #debug=True
        for endpointConf in self.endpointConfs:
            for qid in ["Q6256"]:
                try:
                    tt=TrulyTabular(qid,debug=debug,endpointConf=endpointConf)
                    for minCount in [0,100]:
                        query=tt.mostFrequentPropertiesQuery(minCount=minCount)
                        if debug:
                            print(query.query)
                        self.documentQuery(tt, query,formats=["github"],show=show)
                except (Exception,HTTPError) as ex: 
                    self.handleServiceUnavailable(ex,endpointConf)
                    pass

[docs]    def testSyntaxHighlighting(self):
        '''
        https://github.com/WolfgangFahl/pyLoDStorage/issues/81
        '''
        debug=self.debug
        #debug=True
        qid="Q6256" # country
        tt=TrulyTabular(qid,debug=debug)
        query=tt.mostFrequentPropertiesQuery()
        sh=QuerySyntaxHighlight(query,"html")
        html=sh.highlight()
        if debug:
            print(html)
        self.assertTrue('<span class="k">SELECT</span>' in html)
        pass
    
[docs]    def testCount(self):
        '''
        test the count function of truly tabular
        '''
        debug=self.debug  
        debug=True
        qid="Q55488" # railway stations
        for endpointConf in self.endpointConfs:
            try:
                tt=TrulyTabular(qid,endpointConf=endpointConf,debug=debug)
                count,query=tt.count()
                if debug:
                    print(query)
                    print(f"count of railway stations is {count}")
                self.assertTrue(qid in query)
                self.assertTrue(count>=106195)
                self.assertTrue(tt.error is None)
            except (Exception,HTTPError) as ex: 
                self.handleServiceUnavailable(ex,endpointConf)
              
[docs]    def testGenerateSparqlQuery(self):
        '''
        test Generating a SPARQL query
        '''
        configs=[

            {
                "naive":True,
                "qid": "Q2020153", # academic conference
                "subclassPredicate": "wdt:P31",
                "propertyIdMap": {
                    "P1813": ["label"],
                    "P17": ["label"],
                    "P1476": ["label"]
                },
                "expected": []
            },
            {
                "naive":False,
                "qid": "Q2020153", # academic conference
                "subclassPredicate": "wdt:P31",
                "propertyIdMap": {
                    "P1813": ["sample"],
                    "P17": ["sample"],
                    "P1476": ["sample"]
                },
                "expected": ["GROUP BY","SAMPLE"]
            },
            {
                "naive":False,
                "qid": "Q2020153", # academic conference
                "subclassPredicate": "wdt:P31",
                "propertyIdMap": {
                    "P1813": ["count","list"],
                    "P17": ["sample","ignore"],
                    "P1476": ["count","list"]
                },
                "expected": ["COUNT (DISTINCT","GROUP BY","GROUP_CONCAT (DISTINCT","HAVING"]
            },
            {
                "naive":False,
                "qid": "Q1667921", # novel series
                "subclassPredicate": "wdt:P31",
                "propertyIdMap": {
                    "P50": ["sample","ignore"], # author
                    "P136": ["sample","ignore"],# genre
                    "P1476": ["sample","ignore"] #title
                },
                "expected": ["GROUP BY","HAVING","COUNT","<=1"]
            },
            {
                "naive":False,
                "qid": "Q1667921", # novel series
                "subclassPredicate": "wdt:P31",
                "propertyIdMap": {
                    "P50": ["sample","ignore","label"], # author
                    "P136": ["sample","ignore","label"],# genre
                    "P1476": ["sample","ignore"] #title
                },
                "expected": ["GROUP BY","HAVING","COUNT","<=1"]
            },
            {
                "naive":False,
                "subclassPredicate": "wdt:P279*/wdt:P31*",
                "qid": "Q8063", # rock
                "propertyIdMap": {
                    "P18": ["sample"], # image
                },
                "expected": ["P279"]
            }
            
        ]
        debug=self.debug
        debug=True
        # loop over different test configurations
        for i,config in enumerate(configs):
            # get the test configuration
            qid=config["qid"]
            naive=config["naive"]
            propertyIdMap=config["propertyIdMap"]
            subclassPredicate=config["subclassPredicate"]
            expectedList=config["expected"]
            
            # create a truly tabular analysis
            tt=TrulyTabular(qid, propertyIds=list(propertyIdMap.keys()),subclassPredicate=subclassPredicate)
            varname=tt.item.itemVarname
            # generate a SPARQL Query
            sparqlQuery=tt.generateSparqlQuery(genMap=propertyIdMap,naive=naive)
            if debug:
                print(f"config {i}:")
                pprint(config)
                print(f"{sparqlQuery}")
            # all queries should have basic graph patterns for the subclass
            self.assertTrue(f"?{varname} {subclassPredicate} wd:{qid}." in sparqlQuery)
            # and for the properties
            for pid in propertyIdMap.keys():
                self.assertTrue(f"?{varname} wdt:{pid}" in sparqlQuery)
            for expected in expectedList:
                self.assertTrue(expected in sparqlQuery,f"config {i}:{expected} missing")
    
if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.testName']
    unittest.main()