diff --git a/update_microbio_corpus.snakefile b/update_microbio_corpus.snakefile
index dca1c3557467f907e832eee4229a4e1ab036513e..3756815323e4238b7ec36e3b217b15737608a21f 100644
--- a/update_microbio_corpus.snakefile
+++ b/update_microbio_corpus.snakefile
@@ -7,20 +7,42 @@ all
 '''
 rule all:
     input:
-        corpus=directory("corpora/pubmed/batches-new")
+        corpus="corpora/pubmed/batches",  # plain path: directory() is an output-only flag
+        meta="corpora/pubmed/info.csv"
 		
 
 '''
 generate microbio corpus from the pubmed databank
 '''
-rule generate_concept_path:
+rule generate_corpora:
     input:
         db=config["PUBMED_DB"]
     output:
-        corpus=directory("corpora/pubmed/batches-new")
+        corpus=directory("corpora/pubmed/batches")  # get_meta counts the <id>/batch.xml files found below this directory
     params:
         request="corpora/pubmed/microbio-mesh-terms.txt"
     conda: "softwares/envs/pubmed-index-env.yaml"
     shell: """
     pubmed-search -index {input.db} -outdir {output.corpus} -xml %%/batch.xml -mesh-tree-query {params.request} -batch 1000
-        """
\ No newline at end of file
+        """
+
+'''
+summarise the corpus (source, name, number of batches, build date) in a one-row CSV
+'''
+rule get_meta:
+    input:
+        corpora="corpora/pubmed/batches"
+    output:
+        meta="corpora/pubmed/info.csv"
+    run:
+        from datetime import datetime
+        import pandas
+        # every <id>/batch.xml found below the corpus directory counts as one batch
+        batch_ids = glob_wildcards(input.corpora + "/{id}/batch.xml").id
+        # single-row table; the dict order fixes the CSV column order
+        summary = pandas.DataFrame({
+            "source": ["PubMed"],
+            "name": ["microbio"],
+            "size (nb. batches x 1000 abstracts)": [len(batch_ids)],
+            "date": [datetime.now().strftime("%Y%m%d")]})
+        summary.to_csv(output.meta, index=False)