An unintrusive frontend of scanimage which acts as a paper to pdf converter suitable for texts.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
This repo is archived. You can view files and clone it, but cannot push or open issues/pull-requests.
spectrscan/spectrscan

250 lines
6.4 KiB

#!/usr/bin/env bash
#
# spectrscan
#
# Copyright (C) 2017-2018 frnmst (Franco Masotti) <franco.masotti@live.com>
#
# This file is part of spectrscan.
#
# spectrscan is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# spectrscan is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with spectrscan. If not, see <http://www.gnu.org/licenses/>.
#
# Load the configuration file from the current working directory.
# NOTE(review): SRC_DIR and TMP_DIR are read by the functions below but are
# never assigned in this file; presumably they are defined in this
# configuration file -- confirm.
. ./spectrscan.conf
check_software()
{
    # Verify that every external program run by this script can be found.
    #
    # Outputs: one "[ERROR]" line on stderr for each missing program.
    # Returns: 0 when all programs are available, 1 otherwise.
    local program
    local missing=0

    # 'command -v' is a shell builtin: unlike 'which' it is always available
    # and its output can be discarded so that nothing is printed on stdout.
    for program in \
        scanimage \
        unpaper \
        awk \
        convert \
        pdftk \
        parallel \
        file \
        pamfix \
        sed \
        tr \
        sort \
        ls \
        head \
        grep; do
        if ! command -v "${program}" 1>/dev/null 2>&1; then
            printf '[ERROR] missing required program: %s\n' "${program}" 1>&2
            missing=1
        fi
    done

    return "${missing}"
}
get_last_page_side_a()
{
    # Print the highest page number found among the files named
    # spectrscan_out<number>.pnm in the current directory, without leading
    # zeros. Nothing is printed when no such file exists.
    local page number
    local last=''

    # Iterate over the glob directly instead of parsing the output of ls.
    for page in spectrscan_out*.pnm; do
        number="${page#spectrscan_out}"
        number="${number%.pnm}"
        # Skip the literal pattern (no match) and any name whose middle part
        # is not a plain number.
        case "${number}" in
            '' | *[!0-9]*) continue ;;
        esac
        # Force base 10: a zero padded number would otherwise be parsed as
        # octal by the shell arithmetic.
        number=$((10#${number}))
        if [ -z "${last}" ] || [ "${number}" -gt "${last}" ]; then
            last="${number}"
        fi
    done

    if [ -n "${last}" ]; then
        printf '%s\n' "${last}"
    fi
}
select_page_parameters()
{
    # Set the globals batch_start and batch_increment according to
    # two_sided, side_a and side_b. Both are left untouched when no case
    # applies.
    case "${two_sided}" in
        false)
            # Single sided: consecutive page numbers.
            batch_start=1
            batch_increment=1
            ;;
        *)
            if [ "${side_b}" = 'true' ]; then
                case "${two_sided}" in
                    true)
                        # Second side: count downwards from the page that
                        # follows the last one of the first side.
                        batch_start=$(( $(get_last_page_side_a) + 1 ))
                        batch_increment=-2
                        ;;
                    reverse)
                        # Second side: even pages, counting upwards.
                        batch_start=2
                        batch_increment=2
                        ;;
                esac
            fi
            if [ "${side_a}" = 'true' ]; then
                # First side: odd pages. Evaluated last, so it wins when
                # both side flags are set.
                batch_start=1
                batch_increment=2
            fi
            ;;
    esac
}
build_scan_sub_command()
{
    # Print the scanner specific part of the scanimage command line.
    #
    # Since these are scanner specific options, they need to be probed: an
    # option is added only when the scanner reports values for that option.
    #
    # Put source option before resolution to avoid the error:
    # https://bugs.launchpad.net/simple-scan/+bug/983441
    #
    # Globals read: source, resolution, mode.
    # Outputs: the options on a single line, each preceded by a space.
    local scan_command=''

    if [ -n "$(get_supported_sources)" ]; then
        scan_command="${scan_command} --source ${source}"
    fi
    if [ -n "$(get_supported_resolutions)" ]; then
        scan_command="${scan_command} --resolution ${resolution}"
    fi
    if [ -n "$(get_supported_modes)" ]; then
        scan_command="${scan_command} --mode ${mode}"
    fi

    printf '%s\n' "${scan_command}"
}
scan()
{
    # Run one scanimage batch that writes the pages to
    # spectrscan_out<8 digit number>.pnm in the current directory.
    # The page numbering comes from select_page_parameters and the scanner
    # specific options from build_scan_sub_command.
    # Returns: the exit status of scanimage.
    local scan_sub_command
    local -a scanimage_args

    select_page_parameters
    scan_sub_command="$(build_scan_sub_command)"

    # The expansions below are deliberately left unquoted so that the
    # option string is split into separate words.
    scanimage_args=(
        ${scan_sub_command}
        --batch=spectrscan_out%08d.pnm
        --batch-start ${batch_start}
        --batch-increment ${batch_increment}
        --progress
        --format=pnm
    )

    scanimage "${scanimage_args[@]}"
}
exists_output_file()
{
    # Check if output file exists and is a pdf file.
    # This is needed to concatenate new documents to an
    # already existing pdf file.
    #
    # Globals read: SRC_DIR, output_file.
    # Outputs: 'true'  - the file exists and is a pdf file,
    #          'false' - the file does not exist,
    #          'error' - the file exists but is not a pdf file.
    local target="${SRC_DIR}/${output_file}"
    local mime_type

    if [ ! -f "${target}" ]; then
        echo 'false'
        return
    fi

    # --brief prints the mime type alone, without the file name in front of
    # it, so that a path containing whitespace cannot shift the result.
    mime_type="$(file --brief --mime-type "${target}")"
    if [ "${mime_type}" = 'application/pdf' ]; then
        echo 'true'
    else
        # Not a pdf file.
        echo 'error'
    fi
}
convert_single()
{
    # Turn one scanned page into a single page pdf file.
    #
    # Arguments: $1 - the scanned pnm file.
    # Globals read: imagemagick_options, unpaper_options.
    #
    # Steps: pamfix, then the optional ImageMagick enhancements, then the
    # optional unpaper pass, then the conversion to "<page>.pdf".
    # The enhancements are applied before unpaper: this gives better results
    # because it avoids missing information removed by unpaper in the final
    # result.
    #
    # All the output of this function is discarded.
    local page="${1}"
    local fixed="${page}.tmp.pnm.bis"
    local enhanced="${page}.tmp.pnm"
    local cleaned="${page}.pnm"

    pamfix -truncate "${page}" > "${fixed}"

    if [ "${imagemagick_options}" = 'false' ]; then
        mv "${fixed}" "${enhanced}"
    else
        # Unquoted on purpose: the options are split into separate words.
        convert ${imagemagick_options} "${fixed}" "${enhanced}"
    fi

    case "${unpaper_options}" in
        true)
            unpaper --overwrite -q "${enhanced}" "${cleaned}"
            ;;
        false)
            mv "${enhanced}" "${cleaned}"
            ;;
        *)
            # Unquoted on purpose, as above.
            unpaper --overwrite -q ${unpaper_options} "${enhanced}" "${cleaned}"
            ;;
    esac

    #
    # OCR stuff goes here.
    #

    convert -compress lzw "${cleaned}" "${page}.pdf"
} 1>/dev/null 2>&1
# TODO: use inotify, or add an option to post-process when it is not
# available (e.g. on an sshfs filesystem).
# Process in /tmp if possible -> the previous check is then not needed.
pnm_to_pdf()
{
    # Convert every scanned page of the current directory (files matching
    # spectrscan_*.pnm) to pdf, running one convert_single job per page
    # through GNU Parallel.
    #
    # Globals read: unpaper_options, imagemagick_options (exported for the
    # jobs).
    # Returns: the exit status of parallel, or 1 when there is no page.
    local page
    local -a pages=()

    # A hack to avoid GNU Parallel's message
    mkdir -p ~/.parallel && touch ~/.parallel/will-cite

    # Collect the pages with a glob instead of parsing the output of ls.
    # An unmatched glob stays literal, hence the existence test.
    for page in spectrscan_*.pnm; do
        if [ -f "${page}" ]; then
            pages+=("${page}")
        fi
    done
    if [ "${#pages[@]}" -eq 0 ]; then
        printf '%s\n' '[ERROR] no scanned pages to convert' 1>&2
        return 1
    fi

    # The function and its settings must be exported to be visible from the
    # shells started by parallel.
    export -f convert_single
    export unpaper_options
    export imagemagick_options
    # Every page is passed as its own argument, i.e. one job per page.
    parallel --bar convert_single ::: "${pages[@]}"
}
pdf_cat()
{
printf "%s\n" 'assembling PDF...'
# O(n)
# Always cat to the output file.
# Unlike pdfunite, pdftk does not corrupt the pdf.
pdftk *.pdf cat output "$output_file".tmp
# O(1)
if [ "$(exists_output_file)" = 'true' ]; then
cp "$SRC_DIR"/"$output_file" "$SRC_DIR"/."$output_file"
pdftk "$SRC_DIR"/."$output_file" "$output_file".tmp \
cat output "$SRC_DIR"/"$output_file"
rm "$SRC_DIR"/."$output_file"
elif [ "$(exists_output_file)" = 'false' ]; then
mv "$output_file".tmp "$SRC_DIR"/"$output_file"
else
printf "[ERROR]\n" 1>&2-
exit 1
fi
}
get_supported_parameters()
{
    # Print the lines of the "scanimage -A" output that mention the
    # --source, --mode or --resolution options.
    # Returns: the exit status of grep (non-zero when no line matches).
    local -a wanted=(
        -e '--source'
        -e '--mode'
        -e '--resolution'
    )

    scanimage -A | grep "${wanted[@]}"
}
get_supported_resolutions()
{
    # Print the second field of every line of supported_params that contains
    # "resolution", with each '|' turned into a space and the characters
    # 'd', 'p' and 'i' removed.
    awk '/resolution/ {
        field = $2
        gsub(/\|/, " ", field)
        gsub(/[dpi]/, "", field)
        print field
    }' <<< "${supported_params}"
}
get_supported_modes()
{
    # Print the second field of every line of supported_params that contains
    # "mode", with each '|' turned into a space.
    awk '/mode/ {
        field = $2
        gsub(/\|/, " ", field)
        print field
    }' <<< "${supported_params}"
}
get_supported_sources()
{
    # Print the second field of every line of supported_params that contains
    # "source", with each '|' turned into a space.
    awk '/source/ {
        field = $2
        gsub(/\|/, " ", field)
        print field
    }' <<< "${supported_params}"
}
chain()
{
    # Run the whole pipeline inside the temporary directory TMP_DIR:
    # scan (one or two sides), convert the pages to pdf, join them into the
    # output file and finally remove the temporary directory.
    #
    # Globals read: TMP_DIR, two_sided.
    # Globals written: side_a, side_b.
    # Returns: 1 when the temporary directory cannot be used, otherwise the
    #          exit status of the final clean up.
    if [ -z "${TMP_DIR}" ]; then
        printf '%s\n' '[ERROR] TMP_DIR is not set' 1>&2
        return 1
    fi

    mkdir "${TMP_DIR}"
    # Never go on from the wrong directory: the following steps create,
    # glob and delete files relative to the current one. The directory
    # stack printed by pushd is discarded.
    pushd "${TMP_DIR}" 1>/dev/null || return 1

    if [ "${two_sided}" != 'false' ]; then
        side_a='true'
        # A single && chain: the second side is scanned only when the first
        # side and the prompt succeeded, and only after the side flags have
        # been switched.
        scan \
            && printf "%s\n" 'turn the paper(s) and hit return when ready' \
            && read -r \
            && side_a='false' \
            && side_b='true' \
            && scan
    else
        scan
    fi

    pnm_to_pdf
    pdf_cat

    popd 1>/dev/null
    rm -rf "${TMP_DIR}"
}
preliminary_checks()
{
    # Store the seven positional arguments in global variables, probe the
    # scanner and, when the probe succeeds, run the whole chain.
    #
    # Arguments, in order: imagemagick_options, mode, resolution, source,
    # unpaper_options, two_sided, output_file.
    # Returns: the exit status of the probe when it fails, otherwise the
    #          exit status of chain.
    local name

    # Leak all variables to avoid passing them every time.
    for name in \
        imagemagick_options \
        mode \
        resolution \
        source \
        unpaper_options \
        two_sided \
        output_file; do
        printf -v "${name}" '%s' "${1}"
        shift
    done

    # Get the parameters and check that the scanner is connected: a failed
    # probe ends the function with the status of the probe.
    supported_params="$(get_supported_parameters)" || return

    chain
}
# Stop with the exit status of check_software when it fails.
check_software || exit ${?}
# Load fbopt from the current working directory.
# NOTE(review): nothing in this file calls preliminary_checks; presumably
# fbopt parses the command line and invokes it -- confirm.
. ./fbopt