Files
linux-bin/pdf2bib.sh

99 lines
3.2 KiB
Bash
Raw Normal View History

2019-02-11 14:37:48 -08:00
#!/bin/bash
2020-02-13 12:11:29 -08:00
# Print the built-in help text and stop when the first argument is exactly -h.
case "$1" in
  -h)
    # Quoted delimiter: the text is emitted literally, including the blank
    # first and last lines.
    cat <<'EOF'

pdf2bib - search for a doi within a pdf, query pubmed, and append bibtex entry with pdf to your local bib database file. Last two steps are identical to sdoi.sh
usage:
pdf2bib.sh file.pdf
depends:
pdftotext - from ghostscript or poppler or texlive ?
xsltproc - xml processor, from GNOME project
pubmed2bibtex.xsl - xml processor stylesheet
defaults:
Set the three required default file locations (xsl file, bib file, pdf directory)

EOF
    exit 0
    ;;
esac
2019-02-11 14:37:48 -08:00
# Setup defaults. Each location can be overridden from the environment:
#   pubmedStyleSheet - stylesheet handed to xsltproc (stored in $styleSheet)
#   bibdFileOut      - bib database file that new entries are appended to
#   pdfPathOut       - directory the renamed pdf is moved into
styleSheet=${pubmedStyleSheet:-$HOME/bin/pubmed2bibtex.xsl}
bibdFileOut=${bibdFileOut:-$HOME/projects/bibd/OMEGA.bib}
pdfPathOut=${pdfPathOut:-$HOME/projects/bibd/papers}

# Last component of the pdf directory; it becomes the relative prefix written
# into the bibtex "file" field. Quoted so a directory with spaces still works.
relPath=$(basename -- "$pdfPathOut")
fn=$1
set -e #exit if an error

echo "using $pdfPathOut"
echo "using $bibdFileOut"
#try to extract doi from pdf and retrieve a pubmed id
#
# Each attempt dumps pdf pages as text with pdftotext, keeps the first line
# that matches, lowercases it and strips everything in front of the "10."
# prefix. The pipelines end in sed, so a grep that finds nothing does not trip
# "set -e"; it just leaves $doi empty.
#
# Earlier, looser patterns kept for reference:
#   'DOI:' syntax, first page only
#     doi=$(pdftotext -q -f 1 -l 1 $fn - | grep -i "doi:" --max-count=1 | tr [:upper:] [:lower:] | sed -E "s|doi:(.+)|\1|")
#   'https://doi.org' syntax without requiring the "10." prefix
#     doi=$(pdftotext -q -f 1 -l 1 $fn - | grep -i "doi.org/" --max-count=1 | tr [:upper:] [:lower:] | sed -E "s|.+doi\.org\/(.+)|\1|")
#   bare 'doi ' prefix
#     doi=$(pdftotext -q -f 1 -l 1 $fn - | grep -iE "doi ?" --max-count=1 | tr [:upper:] [:lower:] | sed -E "s|doi ?(.+)|\1|")

# search for a "doi" prefix followed by "10." between page 1 and page 10
doi=$(pdftotext -q -f 1 -l 10 "$fn" - |
  grep -iE "doi:? ?/?10\." --max-count=1 |
  tr '[:upper:]' '[:lower:]' |
  sed -E "s|.*doi:? ?/?(10.+)|\1|")

#for 'https://doi.org' syntax (first page only)
if [ -z "$doi" ]; then
  doi=$(pdftotext -q -f 1 -l 1 "$fn" - |
    grep -iE "doi\.org/10\." --max-count=1 |
    tr '[:upper:]' '[:lower:]' |
    sed -E "s|.+doi\.org/(10.+)|\1|")
fi

# The sed capture runs to the end of the matched line, so any words that follow
# the doi on that line come along. Cut at the first whitespace character.
doi=${doi%%[[:space:]]*}

if [ -z "$doi" ]; then
  echo "doi not found"
  exit 1
fi
2020-02-13 12:11:29 -08:00
2019-02-11 14:37:48 -08:00
## TODO: dedupe this with sdoi.sh
# Ask the NCBI esearch endpoint for the pubmed id that belongs to $doi.
# -G sends the --data-urlencode pairs as a GET query string, so curl
# percent-encodes the doi instead of it being pasted raw into the URL.
# Only the first <Id> element is used and everything but its digits is
# dropped, so $uid is either empty or a single number.
uid=$(curl -s -G "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" \
    --data-urlencode "db=pubmed" \
    --data-urlencode "term=$doi" \
    --data-urlencode "field=doi" \
    --data-urlencode "retmode=xml" |
  grep -oE "<Id>[0-9]+</Id>" |
  head -n 1 |
  tr -cd '0-9')
if [ -z "$uid" ]; then
  echo "pubmed id not found"
  exit 1
fi
#request pubmed xml and transform into bibtex
# Both intermediate files are written to the current directory as <uid>.xml
# and <uid>.bib.
# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page as if it were the record; -S keeps the error message visible despite -s.
curl -sS -f "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=$uid&retmode=xml" > "$uid.xml"
xsltproc --novalid "$styleSheet" "$uid.xml" > "$uid.bib"
#extract some strings to make a nice filename for the pdf

# Print the text of the first <tag>...</tag> element found in an xml file.
# Arguments: $1 - element name, $2 - xml file
# The opening and closing tag must sit on the same line for the sed pattern
# to match. The pipeline ends in sed, so a missing element yields an empty
# string rather than a failure.
xml_first_text() {
  local tag=$1 file=$2
  grep --max-count=1 -- "<$tag>" "$file" |
    sed -E "s|.*<$tag>(.+)</$tag>.*|\1|"
}

# Filter stdin: drop spaces plus the characters that would break the file name
# or a later sed replacement that embeds it ("/", "&", "|" and backslash).
filename_safe() {
  tr -d ' /&|\\'
}

# Print the publication year found inside <PubDate>...</PubDate> of an xml file.
# Arguments: $1 - xml file
# Only the first <Year> is used, so several PubDate blocks cannot produce a
# multi-line value. When there is no <Year>, the first 4-digit number of a
# <MedlineDate> line is used instead.
pubdate_year() {
  local file=$1 pubdate year
  pubdate=$(awk '/<PubDate>/,/<\/PubDate>/' "$file")
  year=$(printf '%s\n' "$pubdate" |
    grep --max-count=1 -- "<Year>" |
    sed -E "s|.*<Year>(.+)</Year>.*|\1|")
  if [ -z "$year" ]; then
    year=$(printf '%s\n' "$pubdate" |
      grep --max-count=1 -- "<MedlineDate>" |
      grep -oE "[0-9]{4}" |
      head -n 1)
  fi
  printf '%s\n' "$year"
}

author=$(xml_first_text "LastName" "$uid.xml" | filename_safe)
journal=$(xml_first_text "MedlineTA" "$uid.xml" | filename_safe)
year=$(pubdate_year "$uid.xml")
fn2=${author}_${journal}${year}-${uid}.pdf
#move pdf file to papers repository, add file name to bibtex file field

# Print $1 with the characters that are special in the replacement half of a
# sed "s|...|...|" command (backslash, "&" and the "|" delimiter) escaped.
sed_escape_replacement() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

mv -- "$fn" "$pdfPathOut/$fn2"
echo "moved to $pdfPathOut/$fn2"

# Rewrite the content of the "file = {...}" field to <relPath>/<new pdf name>.
# The path is escaped first so that it is inserted literally.
fileField=$(sed_escape_replacement "$relPath/$fn2")
sed -i -E "s|(\W*file = \{).*(\}.*)|\1${fileField}\2|" "$uid.bib"

# Append the entry unless the pubmed id already occurs somewhere in the bib
# database. A database file that does not exist yet counts as "not found" and
# is created by the append.
if [[ -f "$bibdFileOut" ]] && grep -qF -- "$uid" "$bibdFileOut"; then
  echo "$uid already found in $bibdFileOut, exiting"
else
  #import bibtex
  echo "importing $uid.bib"
  cat "$uid.bib" >> "$bibdFileOut"
fi

#clean up
rm -- "$uid.xml" "$uid.bib"