#!/bin/bash
#
# Collects per-file size / access-time / modification-time data below a
# directory with GNU find, converts the timestamps to "days ago", and prints
# top-5 lists plus summary statistics.
#
# Statistics back-end, in order of preference: Miller (mlr), GNU datamash,
# or plain awk (which needs awk_stats_on_1st_field.awk next to this script).
#
# Files written in the current directory:
#   all_files_timestamps.csv                 raw find output
#   all_files_timestamps_space_replaced.csv  whitespace replaced by "-.sp.-"
#   all_files_special_chars_replaced.csv     '"' replaced by "-.dq.-"
#   all_files_reltimes.tsv                   TSV with ages expressed in days
#   files_<COMPUTE>_stats.out                accumulated textual results
#   files_{usage,access,modifs}_stats.csv    datamash/awk back-ends only
#
#set -x

VERSION="0.1.1"

# Intermediate/result files handled by backup() and clean().
RESULT_FILES=(
  all_files_reltimes.tsv
  all_files_timestamps.csv
  all_files_timestamps_space_replaced.csv
  all_files_special_chars_replaced.csv
  files_usage_stats.csv
  files_access_stats.csv
  files_modifs_stats.csv
)
# Every value accepted by -a; each one owns a files_<op>_stats.out file.
ANALYZE_OPS=("ALL" "AGE" "SIZE" "ACCESS_TIME" "MODIF_TIME")

# Print the command-line help on stdout.
usage() {
  cat <<EOF
$0 [-d directory][-a [ALL|AGE|ACCESS_TIME|MODIF_TIME|SIZE]][-K][-F][-u][-g][-c|-b][-q][-Q][-G <user/group/both/filetype/all>][-v][-h]
 [-d <dir>]: the directory to analyze (default is '.'),
 [-a <COMPUTE>]: the type of analyze to perform (default is 'ALL' [*]),
 [-K]: keep previous results (default is to clean previous results),
 [-F]: use the file command for each find results (time consumming++; default is to not do it),
 [-u]: add the user to every find results (default is to not do it),
 [-g]: add the group to every find results (default is to not do it),
 [-c]: add the creation time to every find results (default is to not do it).
 [-b]: same as -c (birth time)
 [-q]: do not display any notice/warning informations,
 [-Q]: just performs operations. Do not display anything.
 [-G <user|group|both|filetype/all>]: display statistics grouped by user (implies -u), group(implies -g), both (implies -ug), filetype (implies -F), all (implies -Fug), or none. Default is to not group the results (none).
 [-v]: print version and exit
 [-h]: print this help and exit
 [*]: ALL is equivalent to AGE[**] + SIZE statistics
 [**]: AGE = ACCESS_TIME + MODIF_TIME statistics
EOF
}

# Print the script version on stdout.
version() {
  echo "${VERSION}"
}

KEEP=false
QUIET=false
HIDE=false
WITH_FILE_COMMAND=false
WITH_USER=false
WITH_GROUP=false
WITH_CREATION_TIME=false
GROUPED_BY="none"

# Leading ':' selects silent error reporting: a missing argument yields ':'
# and an unknown option yields '?', both with the option letter in OPTARG.
# -h takes no argument.
while getopts ":hd:a:KFugcbqQG:v" opt; do
  case "${opt}" in
    a) _COMPUTE=${OPTARG} ;;
    d) _DIR=${OPTARG} ;;
    K) KEEP=true ;;
    u) WITH_USER=true ;;
    g) WITH_GROUP=true ;;
    c | b) WITH_CREATION_TIME=true ;;
    F) WITH_FILE_COMMAND=true ;;
    Q) HIDE=true ;;
    q) QUIET=true ;;
    G) GROUPED_BY=${OPTARG} ;;
    v)
      version
      exit 0
      ;;
    h)
      usage
      exit 0
      ;;
    :)
      echo "Option -${OPTARG} requires an argument." >&2
      usage >&2
      exit 1
      ;;
    *)
      echo "Unknown option: -${OPTARG}" >&2
      usage >&2
      exit 1
      ;;
  esac
done
shift "$((OPTIND - 1))"

_COMPUTE=${_COMPUTE:-ALL}
_DIR=${_DIR:-.}

# Locate the optional and required external tools.
# Sets _MLR, _DATAMASH and _GAWK to the command path, or to "" when missing.
check_commands() {
  # checking if miller is installed
  _MLR=$(command -v mlr)
  # checking if datamash is installed
  _DATAMASH=$(command -v datamash)
  # checking if GNU awk is installed; fall back to whatever "awk" is
  _GAWK=$(command -v gawk || command -v awk)
}

# Rename every result file of a previous run to <name>.<timestamp>.
# Missing files are skipped silently (best effort). Sets the global _DATE.
backup() {
  local result_file analyze_op
  _DATE=$(date +"%Y%m%d+%H%M%S")
  for result_file in "${RESULT_FILES[@]}"; do
    mv -- "${result_file}" "${result_file}.${_DATE}" 2>/dev/null
  done
  for analyze_op in "${ANALYZE_OPS[@]}"; do
    mv -- "files_${analyze_op}_stats.out" \
      "files_${analyze_op}_stats.out.${_DATE}" 2>/dev/null
  done
}

# Delete every result file of a previous run; missing files are ignored.
clean() {
  local analyze_op
  rm -f -- "${RESULT_FILES[@]}" 2>/dev/null
  for analyze_op in "${ANALYZE_OPS[@]}"; do
    rm -f -- "files_${analyze_op}_stats.out" 2>/dev/null
  done
}

# Append stdin to a result file, echoing it to stdout unless output is hidden.
# Arguments: $1 - result file, $2 - hide flag ("false" means also display)
emit_result() {
  if [[ "${2}" == false ]]; then
    tee -a "${1}"
  else
    cat >>"${1}"
  fi
}

# Analysis-type predicates. Argument: $1 - the requested analysis (-a value).
wants_size() { [[ "${1}" == "ALL" || "${1}" == "SIZE" ]]; }
wants_access() { [[ "${1}" == "ALL" || "${1}" == "AGE" || "${1}" == "ACCESS_TIME" ]]; }
wants_modif() { [[ "${1}" == "ALL" || "${1}" == "AGE" || "${1}" == "MODIF_TIME" ]]; }

SCRIPTPATH="$(
  cd -- "$(dirname "$0")" >/dev/null 2>&1
  pwd -P
)"

check_commands
if [[ -z "${_GAWK}" ]]; then
  echo "Sorry, but GNU awk is required to run this program... Exiting" >&2
  exit 1
fi

if [[ "${KEEP}" == true ]]; then
  backup
else
  clean
fi

# End Of find Command "EOC"
# using ::/:: as separator to avoid problematic filenames
# ('/' is a forbidden character for files on linux)
EOC="%p::/::%s::/::%A@::/::%T@::/::__birth__::/::__user__::/::__group__"
NL="\n"

# Placeholders that are not requested stay in the output as literal text.
if [[ "${WITH_CREATION_TIME}" == true ]]; then
  EOC="${EOC//__birth__/%B@}"
fi
if [[ "${WITH_USER}" == true || "${GROUPED_BY}" == "user" ||
  "${GROUPED_BY}" == "both" || "${GROUPED_BY}" == "all" ]]; then
  EOC="${EOC//__user__/%u}"
fi
if [[ "${WITH_GROUP}" == true || "${GROUPED_BY}" == "group" ||
  "${GROUPED_BY}" == "both" || "${GROUPED_BY}" == "all" ]]; then
  EOC="${EOC//__group__/%g}"
fi

if [[ "${WITH_FILE_COMMAND}" == true || "${GROUPED_BY}" == "filetype" ||
  "${GROUPED_BY}" == "all" ]]; then
  # the file command has a NL character even with "-b" (brief),
  # so no explicit newline is appended in this branch
  find "${_DIR}" -type f -printf "${EOC}::/::" -exec file -b {} \; \
    >all_files_timestamps.csv
else
  find "${_DIR}" -type f -printf "${EOC}${NL}" >all_files_timestamps.csv
fi

CUR_EPOCH=$(date +%s)
EPOCH_DAYS=$((CUR_EPOCH / (3600 * 24)))

if [[ "${HIDE}" == false && "${QUIET}" == false ]]; then
  echo "Current timestamp: ${CUR_EPOCH} seconds since EPOCH (01/01/1970)"
  echo "That is ${EPOCH_DAYS} days. If a field displays ${EPOCH_DAYS}[,.]*, then you may have a ~'noatime' mounted filesystem."
  echo "+~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~+"
  echo '|/!\ Replacing all space characters in filenames with "-.sp.-"; |'
  echo '| + Replacing all " characters in filenames with "-.dq.-"; |'
  echo '| Files are not renamed, only the filenames used by this script in its temporary {*.csv,*.tsv} files. |'
  echo '| If needed, do a diff between "all_files_timestamps.csv" and "all_files_special_chars_replaced.csv". |'
  echo '|[ Command ===>>> "diff all_files_timestamps.csv all_files_special_chars_replaced.csv" ]|'
  echo "'~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~^~'"
fi

# removing spaces
sed -re "s/\s/-.sp.-/g" all_files_timestamps.csv \
  >all_files_timestamps_space_replaced.csv
# removing double quotes from filenames
sed -e 's/"/-.dq.-/g' all_files_timestamps_space_replaced.csv \
  >all_files_special_chars_replaced.csv

# Build the TSV: header row, then one row per file with timestamps turned
# into ages in days relative to CUR_EPOCH.
echo -e "Filename\tSize\tLast access (Days)\tLast modif. (Days)\tBirth date (Days)\tuser\tgroup\tFiletype" >all_files_reltimes.tsv
"${_GAWK}" -F"::/::" -v OFS="\t" -v EPOCH="${CUR_EPOCH}" -v OFMT="%.2f" \
  '{print $1,$2,(EPOCH-$3)/(3600*24),(EPOCH-$4)/(3600*24),(EPOCH-$5)/(3600*24),$6,$7,$8;}' \
  all_files_special_chars_replaced.csv >>all_files_reltimes.tsv

# Miller back-end: top-5 lists and summary statistics.
# Globals:   _COMPUTE, HIDE, QUIET, _MLR (read)
# Arguments: $1 - optional extra verb chain inserted after the sort
#                 (e.g. "then group-by user"); may be empty
# Outputs:   appends to files_<_COMPUTE>_stats.out; also prints the results
#            unless HIDE is set
miller_f() {
  local stat_results="files_${_COMPUTE}_stats.out"
  local -a group_args=()
  # Split the verb chain into words once instead of relying on an unquoted
  # expansion at every call site.
  if [[ -n "${1}" ]]; then
    read -r -a group_args <<<"${1}"
  fi

  if wants_size "${_COMPUTE}"; then
    echo "Top 5 biggest files:" | emit_result "${stat_results}" "${HIDE}"
    "${_MLR}" --itsv --opprint sort -nr Size "${group_args[@]}" \
      then head -n5 all_files_reltimes.tsv |
      emit_result "${stat_results}" "${HIDE}"
  fi
  if wants_access "${_COMPUTE}"; then
    echo "Top 5 oldest accessed files:" | emit_result "${stat_results}" "${HIDE}"
    "${_MLR}" --itsv --opprint sort -nr "Last access (Days)" "${group_args[@]}" \
      then head -n5 all_files_reltimes.tsv |
      emit_result "${stat_results}" "${HIDE}"
  fi
  if wants_modif "${_COMPUTE}"; then
    echo "Top 5 oldest modified files:" | emit_result "${stat_results}" "${HIDE}"
    "${_MLR}" --itsv --opprint sort -nr "Last modif. (Days)" "${group_args[@]}" \
      then head -n5 all_files_reltimes.tsv |
      emit_result "${stat_results}" "${HIDE}"
  fi

  if [[ "${HIDE}" == false && "${QUIET}" == false ]]; then
    echo "Overall statistics..."
  fi

  if wants_size "${_COMPUTE}"; then
    echo "File size statistics:" | emit_result "${stat_results}" "${HIDE}"
    "${_MLR}" --itsv --opprint --from all_files_reltimes.tsv \
      stats1 -a min,mean,max,sum,median,p25,p75,stddev,count -f Size \
      then put '$Max_GB = $Size_max/(1024**3); $Total_of_files_GB = $Size_sum/(1024**3);' |
      emit_result "${stat_results}" "${HIDE}"
  fi
  if wants_access "${_COMPUTE}"; then
    echo "File access time statistics:" | emit_result "${stat_results}" "${HIDE}"
    "${_MLR}" --itsv --opprint --from all_files_reltimes.tsv \
      stats1 -a min,max,mean,median,p25,p75,stddev,count -f "Last access (Days)" |
      emit_result "${stat_results}" "${HIDE}"
  fi
  if wants_modif "${_COMPUTE}"; then
    echo "File modification time statistics:" | emit_result "${stat_results}" "${HIDE}"
    "${_MLR}" --itsv --opprint --from all_files_reltimes.tsv \
      stats1 -a min,max,mean,median,p25,p75,stddev,count -f "Last modif. (Days)" |
      emit_result "${stat_results}" "${HIDE}"
  fi
}

# datamash back-end: top-5 lists via basic_sort_f, then summary statistics
# written to files_{usage,access,modifs}_stats.csv and displayed unless HIDE
# is set.
# Globals: _COMPUTE, HIDE, QUIET, _DATAMASH, _GAWK (read)
datamash_f() {
  basic_sort_f "${QUIET}" "${HIDE}" "${_COMPUTE}"

  if wants_size "${_COMPUTE}"; then
    "${_DATAMASH}" -H max 2 q1 2 mean 2 q3 2 median 2 sum 2 pstdev 2 min 2 count 2 \
      <all_files_reltimes.tsv >files_usage_stats.csv
    if [[ "${HIDE}" == false ]]; then
      echo "File size statistics"
      # Rows whose last field contains a digit are data rows and get units;
      # anything else (the header) is printed untouched.
      "${_GAWK}" -v OFMT="%.2f" '{if($NF ~ "[[:digit:]]") {print $1/(1024**3)"GB",$2"bytes", $3/1024 "kbytes", $4 "bytes", $5 "bytes", $6/(1024**3)"GB", $7/1024"kbytes", $8 "bytes", $9 "files";} else {print;}}' files_usage_stats.csv | column -t
    fi
  fi
  if wants_access "${_COMPUTE}"; then
    LC_NUMERIC="en_US.UTF-8" "${_DATAMASH}" -H min 3 max 3 q1 3 mean 3 q3 3 median 3 pstdev 3 count 3 \
      <all_files_reltimes.tsv >files_access_stats.csv
    if [[ "${HIDE}" == false ]]; then
      echo "File access statistics"
      column -t files_access_stats.csv
    fi
  fi
  if wants_modif "${_COMPUTE}"; then
    LC_NUMERIC="en_US.UTF-8" "${_DATAMASH}" -H min 4 max 4 q1 4 mean 4 q3 4 median 4 pstdev 4 count 4 \
      <all_files_reltimes.tsv >files_modifs_stats.csv
    if [[ "${HIDE}" == false ]]; then
      echo "File modification statistics"
      column -t files_modifs_stats.csv
    fi
  fi
}

# Feed one column of the TSV, numerically sorted, to the companion awk
# statistics script.
# NOTE(review): the TSV header row is passed along with the data; whether
# awk_stats_on_1st_field.awk skips non-numeric input is not visible from
# this file -- confirm.
# Arguments: $1 - awk field number to extract
awk_column_stats() {
  "${_GAWK}" -v col="${1}" '{print $col}' all_files_reltimes.tsv |
    sort -n |
    "${_GAWK}" -f "${SCRIPTPATH}"/awk_stats_on_1st_field.awk
}

# awk-only back-end: top-5 lists via basic_sort_f, then summary statistics
# written to files_{usage,access,modifs}_stats.csv and displayed unless HIDE
# is set. Section headings are suppressed together with the tables so that
# -Q really prints nothing.
# Globals: _COMPUTE, HIDE, QUIET, _GAWK, SCRIPTPATH (read)
awk_f() {
  local csv_header="Sum\tFile_count\tAverage\tMedian\tstdev\tMin\tMax"

  basic_sort_f "${QUIET}" "${HIDE}" "${_COMPUTE}"

  if wants_size "${_COMPUTE}"; then
    if [[ "${HIDE}" == false ]]; then
      echo "Size statistics (bytes)...."
    fi
    echo -e "${csv_header}" >files_usage_stats.csv
    awk_column_stats 2 >>files_usage_stats.csv
    if [[ "${HIDE}" == false ]]; then
      "${_GAWK}" -v OFMT="%.2f" '{if($NF ~ "[[:digit:]]") {print $1/(1024**3)"GB",$2, $3/1024 "kbytes", $4/1024 "kbytes", $5/1024 "kbytes", $6 "bytes", $7/(1024**3)"GB";} else {print;}}' files_usage_stats.csv | column -t
    fi
  fi
  if wants_access "${_COMPUTE}"; then
    if [[ "${HIDE}" == false ]]; then
      echo "Access time statistics...."
    fi
    echo -e "${csv_header}" >files_access_stats.csv
    awk_column_stats 3 >>files_access_stats.csv
    if [[ "${HIDE}" == false ]]; then
      column -t files_access_stats.csv
    fi
  fi
  if wants_modif "${_COMPUTE}"; then
    if [[ "${HIDE}" == false ]]; then
      echo "Modif. time statistics...."
    fi
    echo -e "${csv_header}" >files_modifs_stats.csv
    awk_column_stats 4 >>files_modifs_stats.csv
    if [[ "${HIDE}" == false ]]; then
      column -t files_modifs_stats.csv
    fi
  fi
}

# Print the five TSV data rows with the largest value from a given column on.
# Arguments: $1 - sort key (column number handed to sort -k)
top5_by_column() {
  tail -n +2 all_files_reltimes.tsv | sort -nr -k"${1}" | head -5
}

# Top-5 lists built with coreutils only (no Miller needed).
# Arguments: $1 - quiet flag (unused), $2 - hide flag, $3 - analysis type
# Outputs:   appends to files_<$3>_stats.out exactly once per row; also
#            prints unless the hide flag is set
basic_sort_f() {
  local stat_results="files_${3}_stats.out"

  if wants_size "${3}"; then
    echo "Top 5 biggest files:" | emit_result "${stat_results}" "${2}"
    top5_by_column 2 | emit_result "${stat_results}" "${2}"
  fi
  if wants_access "${3}"; then
    echo "Top 5 oldest accessed files:" | emit_result "${stat_results}" "${2}"
    top5_by_column 3 | emit_result "${stat_results}" "${2}"
  fi
  if wants_modif "${3}"; then
    echo "Top 5 oldest modified files:" | emit_result "${stat_results}" "${2}"
    top5_by_column 4 | emit_result "${stat_results}" "${2}"
  fi
}

# Count how often each value of one TSV column occurs and report the five
# most frequent as "<value> <count>/<rows> <percent>%".
# Rows that do not have the requested column are not counted.
# Arguments: $1 - heading line, $2 - column number, $3 - hide flag,
#            $4 - result file
count_by_column() {
  echo "${1}" | emit_result "${4}" "${3}"
  "${_GAWK}" -v col="${2}" '
    NR == 1 { next }
    NF >= col { b[$col]++ }
    END { for (i in b) print i, b[i]"/"(NR-1), b[i]*100/(NR-1)"%" }
  ' all_files_reltimes.tsv | sort -nr -k2 | head -5 |
    emit_result "${4}" "${3}"
}

# Top 5 file owners.
# Arguments: $1 - quiet flag (unused), $2 - hide flag, $3 - result file
count_users() {
  count_by_column "Top 5 users:" 6 "${2}" "${3}"
}

# Top 5 owning groups.
# Arguments: $1 - quiet flag (unused), $2 - hide flag, $3 - result file
count_groups() {
  count_by_column "Top 5 groups:" 7 "${2}" "${3}"
}

# Top 5 file types (as reported by the file command).
# Arguments: $1 - quiet flag (unused), $2 - hide flag, $3 - result file
count_filetypes() {
  count_by_column "Top 5 filetypes:" 8 "${2}" "${3}"
}

# Run the grouped counters selected by -G; any other value does nothing.
# Globals:   GROUPED_BY, QUIET, HIDE (read)
# Arguments: $1 - result file
run_group_counts() {
  case "${GROUPED_BY}" in
    both)
      count_users "${QUIET}" "${HIDE}" "${1}"
      count_groups "${QUIET}" "${HIDE}" "${1}"
      ;;
    user)
      count_users "${QUIET}" "${HIDE}" "${1}"
      ;;
    group)
      count_groups "${QUIET}" "${HIDE}" "${1}"
      ;;
    filetype)
      count_filetypes "${QUIET}" "${HIDE}" "${1}"
      ;;
    all)
      count_users "${QUIET}" "${HIDE}" "${1}"
      count_groups "${QUIET}" "${HIDE}" "${1}"
      count_filetypes "${QUIET}" "${HIDE}" "${1}"
      ;;
  esac
}

stat_results="files_${_COMPUTE}_stats.out"

if [[ -n "${_MLR}" ]]; then
  # Miller: grouped counts come first, then the (optionally grouped) stats.
  case "${GROUPED_BY}" in
    both) GROUP_CMD="then group-by user,group" ;;
    user) GROUP_CMD="then group-by user" ;;
    group) GROUP_CMD="then group-by group" ;;
    filetype) GROUP_CMD="then group-by Filetype" ;;
    all) GROUP_CMD="then group-by user,group,Filetype" ;;
    *) GROUP_CMD="" ;;
  esac
  run_group_counts "${stat_results}"
  miller_f "${GROUP_CMD}"
elif [[ -n "${_DATAMASH}" ]]; then
  datamash_f
  run_group_counts "${stat_results}"
else
  awk_f
  run_group_counts "${stat_results}"
fi