# Patch summary (commentary only; text before the "diff --git" header is not part of the patch).
#
# Changes made to update_microbio_corpus.snakefile:
#   * rule all: the requested corpus directory changes from "corpora/pubmed/batches-new"
#     to "corpora/pubmed/batches", and "corpora/pubmed/info.csv" is added as a second
#     target named `meta`.
#   * rule generate_concept_path is renamed to generate_corpora and its directory output
#     moves to "corpora/pubmed/batches"; the pubmed-search command line is unchanged.
#   * the closing triple quote of the shell block now ends with a newline.
#   * new rule get_meta: takes the batches directory as input, counts the
#     "<id>/batch.xml" entries found under it with glob_wildcards, and writes a one-row
#     CSV (columns: source, name, size, date) to "corpora/pubmed/info.csv" via pandas.
#     The date column is the current local date formatted as YYYYMMDD.
#
# NOTE(review): this patch was recovered from a rendering in which every line break had
# been collapsed. Line breaks and indentation below are reconstructed. The hunk header
# (-7,20 +7,42) implies 16 context lines; the two blank context lines after `rule all`
# are an assumption that makes the counts match -- confirm against the repository
# before applying.
diff --git a/update_microbio_corpus.snakefile b/update_microbio_corpus.snakefile
index dca1c3557467f907e832eee4229a4e1ab036513e..3756815323e4238b7ec36e3b217b15737608a21f 100644
--- a/update_microbio_corpus.snakefile
+++ b/update_microbio_corpus.snakefile
@@ -7,20 +7,42 @@ all
 '''
 rule all:
     input:
-        corpus=directory("corpora/pubmed/batches-new")
+        corpus=directory("corpora/pubmed/batches"),
+        meta="corpora/pubmed/info.csv"
 
 
 '''
 generate microbio corpus from the pubmed databank
 '''
-rule generate_concept_path:
+rule generate_corpora:
     input:
         db=config["PUBMED_DB"]
     output:
-        corpus=directory("corpora/pubmed/batches-new")
+        corpus=directory("corpora/pubmed/batches")
     params:
         request="corpora/pubmed/microbio-mesh-terms.txt"
     conda: "softwares/envs/pubmed-index-env.yaml"
     shell: """
         pubmed-search -index {input.db} -outdir {output.corpus} -xml %%/batch.xml -mesh-tree-query {params.request} -batch 1000
-        """
\ No newline at end of file
+        """
+
+'''
+get metadata
+'''
+rule get_meta:
+    input:
+        corpora="corpora/pubmed/batches"
+    output:
+        meta="corpora/pubmed/info.csv"
+    run:
+        import datetime
+        import pandas
+        date = datetime.datetime.now().strftime("%Y%m%d")
+        batches, = glob_wildcards(input.corpora + "/{id}/batch.xml")
+        corpus_size = len(batches)
+        data = {"source": "PubMed",
+                "name": ["microbio"],
+                "size (nb. batches x 1000 abstracts)" : [corpus_size],
+                "date" : [date]}
+        df = pandas.DataFrame(data)
+        df.to_csv(output.meta, index=False)