# Computes some stats about a corpus

import os
import sys


def check_and_incr(text: str, token: str, dic: dict) -> None:
    """Add one to ``dic[token]`` when ``token`` occurs in ``text``.

    The counter is created on first use. The increment is always exactly
    one, no matter how many times ``token`` appears in ``text``; ``dic`` is
    left untouched when ``token`` is absent.
    """
    if token in text:
        dic[token] = dic.get(token, 0) + 1


class Stats:
    """Computes statistics about a corpus, such as the number of modalities
    and the number of actions.

    Every counter counts *texts*, not occurrences: one call to a
    ``count_*`` method raises a given counter by at most one.
    """

    # URIs searched for by count_modalities().
    modalities = ["odrl:permission", "odrl:obligation", "odrl:prohibition"]
    # URIs searched for by count_actions().
    actions = ["cc:Distribution", "odrl:play", "odrl:run", "cc:Reproduction",
               "odrl:use", "odrl:display", "odrl:copy", "odrl:sell",
               "odrl:modify", "odrl:derive"]

    def __init__(self):
        self.nb_mods = {}     # modality URI -> number of texts containing it
        self.nb_actions = {}  # action URI -> number of texts containing it
        self.nb_ands = 0      # number of texts containing " and "
        self.nb_nots = 0      # number of texts containing " not " / " cannot "

    def count_modalities(self, text):
        """Record which of the known modalities occur in ``text``."""
        for mod in self.modalities:
            check_and_incr(text, mod, self.nb_mods)

    def count_actions(self, text):
        """Record which of the known actions occur in ``text``."""
        for act in self.actions:
            check_and_incr(text, act, self.nb_actions)

    def count_and(self, text):
        """Add one to ``nb_ands`` when ``text`` contains ``" and "``."""
        if " and " in text:
            self.nb_ands += 1

    def count_not(self, text):
        """Add one to ``nb_nots`` when ``text`` contains a negation.

        ``" not "`` and ``" cannot "`` both count as a negation; a text
        holding both still only adds one.
        """
        if " not " in text or " cannot " in text:
            self.nb_nots += 1

    def __str__(self):
        """Return a multi-line, human-readable report of all counters."""
        lines = ["Number of modalities:\n"]
        lines.extend(f"\t{uri}: {nb}\n" for uri, nb in self.nb_mods.items())
        lines.append("Number of actions:\n")
        lines.extend(f"\t{uri}: {nb}\n" for uri, nb in self.nb_actions.items())
        lines.append(
            f"Number of and: {self.nb_ands}\nNumber of not: {self.nb_nots}\n"
        )
        return "".join(lines)


def main_stats(path_sentences, path_odrl):
    """Build a ``Stats`` object from a sentence corpus and an ODRL corpus.

    Args:
        path_sentences: directory whose ``*.txt`` files are scanned for
            conjunctions and negations.
        path_odrl: directory whose ``*.ttl`` files are scanned for
            modalities and actions.

    Both directories may be given with or without a trailing path
    separator. Files with any other extension are ignored.

    Returns:
        The populated ``Stats`` instance.

    Raises:
        OSError: if a directory cannot be listed or a file cannot be read.
        UnicodeDecodeError: if a scanned file is not valid UTF-8.
    """
    stats = Stats()

    # Sorted so that the order in which keys first appear in the counters
    # (and therefore the printed report) is reproducible across runs.
    for fname in sorted(os.listdir(path_odrl)):
        if fname.endswith(".ttl"):
            # os.path.join rather than "+": the directory argument is not
            # guaranteed to end with a separator.
            with open(os.path.join(path_odrl, fname), "r",
                      encoding="utf-8") as fodrl:
                odrl_text = fodrl.read()
            stats.count_modalities(odrl_text)
            stats.count_actions(odrl_text)

    for fname in sorted(os.listdir(path_sentences)):
        if fname.endswith(".txt"):
            with open(os.path.join(path_sentences, fname), "r",
                      encoding="utf-8") as ftext:
                sent_text = ftext.read()
            stats.count_and(sent_text)
            stats.count_not(sent_text)

    return stats


if __name__ == "__main__":

    if len(sys.argv) < 3:
        print(f"Usage: python3 {sys.argv[0]} <path_corpus_sentences> <path_corpus_odrl>")
        # sys.exit, not the site-provided exit(), which is absent under -S.
        sys.exit(1)

    path_sentences = sys.argv[1]
    path_odrl = sys.argv[2]

    stats = main_stats(path_sentences, path_odrl)

    print(stats)