pdf2bib.sh

#!/bin/bash
# pdf2bib.sh - find a doi inside a pdf, look the paper up on pubmed, move the
# pdf into the papers directory and append a bibtex entry to the bib database.

# Print the help text on stdout.
usage() {
  cat <<'EOF'
pdf2bib - search for a doi within a pdf, query pubmed, and append bibtex entry with pdf to your local bib database file. Last two steps are identical to sdoi.sh

usage:
  pdf2bib.sh file.pdf
  pdf2bib.sh -h | --help

depends:
  pdftotext - from poppler (poppler-utils) or xpdf
  xsltproc - xml processor, from GNOME project
  pubmed2bibtex.xsl - xml processor stylesheet
  curl - used to query the NCBI eutils (pubmed) web service

defaults (override through the environment):
  pubmedStyleSheet - xsl file      (default: $HOME/bin/pubmed2bibtex.xsl)
  bibdFileOut      - bib file      (default: $HOME/projects/bibd/OMEGA.bib)
  pdfPathOut       - pdf directory (default: $HOME/projects/bibd/papers)
EOF
}

# ${1:-} keeps the test well-defined when the script is started without arguments.
if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
  usage
  exit 0
fi

# Setup defaults.
# Each location can be overridden from the environment; note that the style
# sheet is read from $pubmedStyleSheet while the other two reuse their own name.
styleSheet="${pubmedStyleSheet:-$HOME/bin/pubmed2bibtex.xsl}"
bibdFileOut="${bibdFileOut:-$HOME/projects/bibd/OMEGA.bib}"
pdfPathOut="${pdfPathOut:-$HOME/projects/bibd/papers}"
# Last component of the pdf directory; it becomes the directory prefix of the
# path written into the bibtex "file" field further down.
# Quoted so that a directory name containing spaces survives intact.
relPath=$(basename -- "$pdfPathOut")
# The pdf to process (empty when no argument was given).
fn=${1:-}

set -e # exit if an error

echo "using $pdfPathOut"
echo "using $bibdFileOut"

# Try to extract a doi from the pdf text.
#
# Two spellings are recognised, tried in this order:
#   1. "doi:10.xxxx/..."  - the colon, one space and one slash after "doi" are
#      all optional; pages 1-10 are searched
#   2. "doi.org/10.xxxx/..." (url form); only page 1 is searched
# In both cases the first matching line wins and the result is lower-cased.

# Read text on stdin and print the doi that follows the extended-regex prefix
# $1 on the first matching line; print nothing when there is no usable match.
# The doi is cut at the first whitespace and trailing sentence punctuation is
# dropped, so words that follow the doi on the same line do not leak into the
# pubmed query. The prefix must not contain the '|' character.
find_doi() {
  grep -iE --max-count=1 -- "${1}10\." \
    | tr '[:upper:]' '[:lower:]' \
    | sed -nE "s|.*${1}(10\.[^[:space:]]+).*|\1|p" \
    | sed -E 's|[.,;]+$||'
}

# Fail early with a readable message instead of a silent pdftotext error.
if [ ! -f "$fn" ]; then
  echo "usage: pdf2bib.sh file.pdf (pdf2bib.sh -h for help)" >&2
  exit 1
fi

# search for doi string between first page and page 10
doi=$(pdftotext -q -f 1 -l 10 "$fn" - | find_doi 'doi:? ?/?')

# for 'https://doi.org' syntax
if [ -z "$doi" ]; then
  doi=$(pdftotext -q -f 1 -l 1 "$fn" - | find_doi 'doi\.org/')
fi

if [ -z "$doi" ]; then
  echo "doi not found" >&2
  exit 1
fi


## TODO: dedupe this with sdoi.sh
# Ask the NCBI esearch endpoint for the pubmed id registered for this doi.
# -G sends the --data-urlencode pairs as a percent-encoded query string, so a
# doi containing characters such as '<', '(', ';' or '#' cannot break the url.
# Only the first <Id> is kept: more than one hit would otherwise produce a
# multi-line uid and unusable file names below.
uid=$(curl -s -G "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" \
    --data-urlencode "db=pubmed" \
    --data-urlencode "term=$doi" \
    --data-urlencode "field=doi" \
    --data-urlencode "retmode=xml" \
  | grep -E --max-count=1 "<Id>[0-9]+</Id>" \
  | sed -E "s|.*<Id>([0-9]+)</Id>.*|\1|")

if [ -z "$uid" ]; then
  echo "pubmed id not found" >&2
  exit 1
fi

# Request the pubmed record as xml and transform it into bibtex.
# Both scratch files live in the current directory and are named after the
# pubmed id; the EXIT trap removes them even when a later step aborts the
# script (expanded at exit time, hence the single quotes).
trap 'rm -f -- "$uid.xml" "$uid.bib"' EXIT
# --fail turns an http error into a non-zero exit status, so that `set -e`
# stops the script instead of handing an error page to xsltproc.
curl -s --fail "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=$uid&retmode=xml" > "$uid.xml"
xsltproc --novalid "$styleSheet" "$uid.xml" > "$uid.bib"

# Extract some strings to make a nice filename for the pdf:
#   <first author last name>_<journal abbreviation><year>-<pubmed id>.pdf

# Read xml on stdin and print the text of the first <$1>...</$1> element that
# sits on a single line, with spaces and slashes removed so the value is safe
# to use as part of a file name. Prints nothing when the element is absent.
xml_first() {
  grep --max-count=1 -- "<$1>" \
    | sed -E "s|.*<$1>([^<]+)</$1>.*|\1|" \
    | tr -d ' /'
}

author=$(xml_first LastName < "$uid.xml")
journal=$(xml_first MedlineTA < "$uid.xml")
# Only a <Year> inside a <PubDate> element counts; other dates in the record
# (which also carry a <Year>) are skipped by the awk range.
year=$(awk '/<PubDate>/,/<\/PubDate>/' "$uid.xml" | xml_first Year)

fn2=${author}_${journal}${year}-${uid}.pdf

# Move the pdf into the papers repository and record its new location in the
# "file" field of the freshly generated bibtex entry.

# Print $1 with the characters that are special on the replacement side of a
# sed "s|...|...|" command (backslash, '&' and the '|' delimiter) escaped.
sed_repl_escape() {
  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
}

# "--" keeps a file name that starts with a dash from being read as an option.
mv -- "$fn" "$pdfPathOut/$fn2"
echo "moved to $pdfPathOut/$fn2"
# Replace whatever is between the braces of the file field with <relPath>/<new name>.
sed -i -E "s|(\W*file = \{).*(\}.*)|\1$(sed_repl_escape "$relPath/$fn2")\2|" "$uid.bib"

# Append the new entry to the bib database unless the pubmed id is already in it.

# Succeed when file $2 exists and mentions pubmed id $1 as a whole word.
# Fixed-string, whole-word matching keeps id 1234 from "matching" 123456, and
# plain grep (already required above) avoids a silent miss when ripgrep is
# not installed.
bib_has_uid() {
  [ -f "$2" ] && grep -qwF -- "$1" "$2"
}

if bib_has_uid "$uid" "$bibdFileOut"; then
  echo "$uid already found in $bibdFileOut, exiting"
else
  #import bibtex
  echo "importing $uid.bib"
  cat -- "$uid.bib" >> "$bibdFileOut"
fi

#clean up
rm -f -- "$uid.xml" "$uid.bib"
add file 2019-02-11 14:37:48 -08:00			`#!/bin/bash`
helpFound 2020-02-13 12:11:29 -08:00			`if [ "$1" == "-h" ] ; then`
			`echo "`
			`pdf2bib - search for a doi within a pdf, query pubmed, and append bibtex entry with pdf to your local bib database file. Last two steps are identical to sdoi.sh`

			`usage:`
			`pdf2bib.sh file.pdf`

			`depends:`
			`pdftotext - from ghostscript or poppler or texlive ?`
			`xsltproc - xml processor, from GNOME project`
			`pubmed2bibtex.xsl - xml processor stylesheet`

			`defaults:`
			`Set the three required default file locations (xsl file, bib file, pdf directory)`
			`"`
			`exit 0`
			`fi`
add file 2019-02-11 14:37:48 -08:00
			`#Setup defaults`
helpFound 2020-02-13 12:11:29 -08:00			`styleSheet=${pubmedStyleSheet:-$HOME/bin/pubmed2bibtex.xsl}`
			`bibdFileOut=${bibdFileOut:-$HOME/projects/bibd/OMEGA.bib}`
			`pdfPathOut=${pdfPathOut:-$HOME/projects/bibd/papers}`
add file 2019-02-11 14:37:48 -08:00			`relPath=$(basename $pdfPathOut)`
			`fn=$1`

			`set -e #exit if an error`
helpFound 2020-02-13 12:11:29 -08:00
add file 2019-02-11 14:37:48 -08:00			`echo "using $pdfPathOut"`
			`echo "using $bibdFileOut"`

			`#try to extract doi from pdf and retrieve a pubmed id`
helpFound 2020-02-13 12:11:29 -08:00			`#for 'DOI:' syntax`
spring library 2021-04-08 20:52:17 -07:00			`# doi=$(pdftotext -q -f 1 -l 1 $fn - \| grep -i "doi:" --max-count=1 \| tr [:upper:] [:lower:] \| sed -E "s\|doi:(.+)\|\1\|")`
new things 2021-03-30 00:37:43 -07:00
spring library 2021-04-08 20:52:17 -07:00			`# search for doi string between first page last page 10`
			`doi=$(pdftotext -q -f 1 -l 10 $fn - \| grep -iE "doi:? ?/?10\." --max-count=1 \| tr [:upper:] [:lower:] \| sed -E "s\|.*doi:? ?/?(10.+)\|\1\|")`
new things 2021-03-30 00:37:43 -07:00
add file 2019-02-11 14:37:48 -08:00
helpFound 2020-02-13 12:11:29 -08:00			`#for 'https://doi.org' syntax`
			`if [ -z "$doi" ]; then`
spring library 2021-04-08 20:52:17 -07:00			`doi=$(pdftotext -q -f 1 -l 1 $fn - \| grep -iE "doi\.org/10\." --max-count=1 \| tr [:upper:] [:lower:] \| sed -E "s\|.+doi\.org/(10.+)\|\1\|")`
helpFound 2020-02-13 12:11:29 -08:00			`fi`

new things 2021-03-30 00:37:43 -07:00			`# for 'https://doi.org' syntax`
			`# if [ -z "$doi" ]; then`
spring library 2021-04-08 20:52:17 -07:00			`# doi=$(pdftotext -q -f 1 -l 1 $fn - \| grep -i "doi.org/" --max-count=1 \| tr [:upper:] [:lower:] \| sed -E "s\|.+doi\.org\/(.+)\|\1\|")`
new things 2021-03-30 00:37:43 -07:00			`# fi`
			`#`
			`# if [ -z "$doi" ]; then`
spring library 2021-04-08 20:52:17 -07:00			`# doi=$(pdftotext -q -f 1 -l 1 $fn - \| grep -iE "doi ?" --max-count=1 \| tr [:upper:] [:lower:] \| sed -E "s\|doi ?(.+)\|\1\|")`
new things 2021-03-30 00:37:43 -07:00			`# fi`

add file 2019-02-11 14:37:48 -08:00			`if [ -z "$doi" ]; then`
			`echo "doi not found"`
			`exit 1`
			`fi`

helpFound 2020-02-13 12:11:29 -08:00
add file 2019-02-11 14:37:48 -08:00			`## TODO: dedupe this with sdoi.sh`
spring library 2021-04-08 20:52:17 -07:00			`uid=$(curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=$doi&field=doi&retmode=xml" \| grep -E "<Id>[0-9]+</Id>" \| sed -E "s\|<Id>([0-9]+)</Id>\|\1\|")`
add file 2019-02-11 14:37:48 -08:00
			`if [ -z "$uid" ]; then`
			`echo "pubmed id not found"`
			`exit 1`
			`fi`

			`#request pubmed xml and transform into bibtex`
			`curl -s "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=$uid&retmode=xml" > $uid.xml`
			`xsltproc --novalid $styleSheet $uid.xml > $uid.bib`

			`#extract some strings to make a nice filename for the pdf`
			`key="LastName";`
spring library 2021-04-08 20:52:17 -07:00			`author=$(grep $key --max-count=1 $uid.xml \| sed -E "s\|\W*<$key>(.+)</$key>\W*\|\1\|" \| tr -d " ")`
add file 2019-02-11 14:37:48 -08:00
			`key="MedlineTA";`
spring library 2021-04-08 20:52:17 -07:00			`journal=$(grep $key --max-count=1 $uid.xml \| sed -E "s\|\W*<$key>(.+)</$key>\W*\|\1\|" \| tr -d " ")`
add file 2019-02-11 14:37:48 -08:00
			`key1="PubDate";`
spring library 2021-04-08 20:52:17 -07:00			`key2="Year"; year=$(awk "/<$key1>/,/<\/$key1>/" $uid.xml \| grep $key2 \| sed -E "s\|\W*<$key2>(.+)</$key2>\W*\|\1\|")`
add file 2019-02-11 14:37:48 -08:00
			`fn2=${author}_${journal}$year-$uid.pdf`

			`#move pdf file to papers repository, add file name to bibtex file field`
			`mv $fn $pdfPathOut/$fn2`
			`echo "moved to $pdfPathOut/$fn2"`
			`sed -i -E "s\|(\W*file = \{).*(\}.*)\|\1$relPath/$fn2\2\|" $uid.bib`

			`if [[ -z $(rg $uid $bibdFileOut) ]]; then`
			`#import bibtex`
			`echo "importing $uid.bib"`
			`cat $uid.bib >> $bibdFileOut`
			`else`
			`echo "$uid already found in $bibdFileOut, exiting"`
			`fi`

			`#clean up`
			`rm $uid.xml $uid.bib`