[Techtalk] interesting photo management problem-- weeding out duplicates

Peggy Russell prusselltechgroup at gmail.com
Wed Apr 30 07:13:48 UTC 2014


Hi Carla,

Like you and others, I use a variation on the same approach: checksums, a bash
associative array, and process substitution. I create two files: one is a simple
list of the duplicates, and the other contains the mv commands, which can be
executed later after a quick review. A snippet is shown below:

function find_duplicates() {
  # read "checksum filename" pairs produced by sha256sum
  while read -r checksum filename; do

    # empty files all share the same checksum, so count them separately
    if [[ "$(stat --format='%s' "${filename}")" -eq 0 ]]; then
      ((numEmpty++))
      printf -- 'XXXX %s is empty\n' "${filename}" >>"${dupList}"
      continue
    fi

    if [[ -n "${files[$checksum]:+set}" ]]; then
      ((numDups++))
      # create list
      printf -- '%04d %s\n'  "${numDups}" "${filename}" >>"${dupList}"
      printf -- '%04d %s\n'  "${numDups}" "${files[$checksum]}" >>"${dupList}"
      # create commands
      printf -- '# %d) %s = %s\n'  "${numDups}" "${filename}" "${files[$checksum]}"
      # make filenames unique, noclobber, handle spaces
      newFilename="${filename}-$(date '+%Y%m%d%H%M%S')"
      printf -- 'mv "%s" "%s"\n' "${filename}" "${newFilename}"
      printf -- 'mv --target-directory="%s" "%s"\n\n' "${dupDir}" "${newFilename}"
    else
      # first time this checksum has been seen; remember the file
      files[$checksum]="${filename}"
    fi

  done < <(find "${@}" -type f -exec sha256sum {} +) >"${dupCommands}"
}

# dupList, dupCommands, and dupDir are assumed to be set earlier in the full script
declare -A files          # checksum -> first filename seen with that checksum
numDups=0
numEmpty=0

: >"${dupList}" >"${dupCommands}"    # truncate both output files
find_duplicates "${@}"

printf -- '   Unique files: %6d\n' "${#files[@]}"
printf -- 'Duplicate files: %6d\n' "${numDups}"
printf -- '    Empty files: %6d\n' "${numEmpty}"
printf -- '    Total files: %6d\n' "$((${#files[@]} + numDups + numEmpty))"
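
To give a rough idea of the review step, the generated commands file ends up
holding entries along these lines (the filenames and the duplicates directory
here are made-up examples, not real output):

# 1) ./2013/beach/IMG_0042.JPG = ./backup/IMG_0042.JPG
mv "./2013/beach/IMG_0042.JPG" "./2013/beach/IMG_0042.JPG-20140430071348"
mv --target-directory="/home/peg/photo-dups" "./2013/beach/IMG_0042.JPG-20140430071348"

After skimming that file and deleting any entries for files you want to leave
alone, one way to run it is simply:

bash "${dupCommands}"

Nothing is moved until you do that, so the quick review is the safety net.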

Peg Russell
