Script to get filename stats

Magnifying glass

I wanted to generate word statistics for a large number of files, so I wrote this script. It removes special characters from filenames and splits each filename into words. The extension is discarded.

Raw
#!/usr/bin/env python
import argparse
import configparser
import os
import pathlib
import re

import argcomplete


mincount = 10


def getconf():
    """
    Load the filenamestat configuration, creating it on first use.

    The configuration is stored in ``~/.config/filenamestat.ini``. If that
    file does not exist, it is written with the default options ``minlen``,
    ``mincount`` and ``stopwords`` in the DEFAULT section; the ``~/.config``
    directory is created when missing. If the file exists, its content is
    read as-is.

    :returns: configparser.ConfigParser holding the configuration
    """
    config = configparser.ConfigParser()
    configfile = os.path.join(pathlib.Path.home(), ".config", "filenamestat.ini")
    if not os.path.isfile(configfile):
        config["DEFAULT"]["minlen"] = "3"
        config["DEFAULT"]["mincount"] = "10"
        config["DEFAULT"]["stopwords"] = "and,the,for"
        # A fresh home directory may not have ~/.config yet; without this
        # the open() below fails with FileNotFoundError.
        os.makedirs(os.path.dirname(configfile), exist_ok=True)
        with open(configfile, "w") as fh:
            config.write(fh)
        return config
    config.read(configfile)
    return config


def filenamestats(paths):
    """
    Finds all files below the given directories and returns name stats

    Each filename is stripped of its extension, lowercased and split into
    words on every run of non-alphanumeric characters (underscores
    included). Empty fragments, which appear when a name starts or ends
    with a special character, are not counted.

    :param paths: iterable of paths to walk; a single str or os.PathLike
                  is accepted as well and treated as one path
    :returns: dict where the keys are found words and the values are the
              occurrences.
    """
    # A lone path must not be iterated character by character.
    if isinstance(paths, (str, os.PathLike)):
        paths = [paths]
    separator = re.compile(r"[\W_]+")
    words = {}
    for path in paths:
        for _dirpath, _dirnames, filenames in os.walk(path):
            for filename in filenames:
                # Remove extension
                stem = os.path.splitext(filename.strip())[0].lower()
                # Split on non-alphanumeric runs, skipping empty fragments
                for word in separator.split(stem):
                    if not word:
                        continue
                    words[word] = words.get(word, 0) + 1
    return words


def dumpstats(words, stopwords=None, minlen=3, mincount=mincount):
    """
    Prints the word statistics, most frequent words first

    :param words: dict mapping words to their number of occurrences
    :param stopwords: words that are never printed
    :param minlen: minimum length a word needs to be printed
    :param mincount: minimum number of occurrences a word needs to be
                     printed
    """
    ignored = stopwords if stopwords else []
    ranked = sorted(words.items(), key=lambda item: item[1], reverse=True)
    for term, hits in ranked:
        # Skip stopwords as well as words that are too short or too rare
        if term in ignored or len(term) < minlen or hits < mincount:
            continue
        print(f"{term}: {hits}")


if __name__ == "__main__":
    # Command line entry point: parse the arguments, merge them with the
    # configuration file and print the word statistics.
    parser = argparse.ArgumentParser(description="Filename stats")
    # With nargs="*" the default has to be a list, like the parsed values.
    parser.add_argument("paths", type=str, nargs="*", default=["."], help="Path to search")
    parser.add_argument(
        "--mincount",
        "-m",
        type=int,
        # None means "not given", so the configured value can take effect.
        default=None,
        help=f"Show only results with that many matches ({mincount})",
    )
    parser.add_argument(
        "--stopwords",
        "-s",
        nargs="*",
        default=[],
        help="Ignore those words",
    )
    # Shell completion only works when it is hooked in before parsing.
    argcomplete.autocomplete(parser)
    args = parser.parse_args()
    config = getconf()
    defaults = config["DEFAULT"]
    # Tolerate spaces and empty entries in the comma separated config value.
    configured = [
        entry.strip()
        for entry in defaults.get("stopwords", "").split(",")
        if entry.strip()
    ]
    # Counted words are lowercased, so the stopwords have to be as well.
    stopwords = [entry.lower() for entry in args.stopwords + configured]
    # An explicit command line value (including 0) wins over the config.
    if args.mincount is not None:
        mincount = args.mincount
    else:
        mincount = defaults.getint("mincount", fallback=mincount)
    words = filenamestats(args.paths)
    dumpstats(
        words,
        stopwords=stopwords,
        minlen=defaults.getint("minlen", fallback=3),
        mincount=mincount,
    )

0 comments

Reply

Cancel reply
Markdown. Syntax highlighting with <code lang="php"><?php echo "Hello, world!"; ?></code> etc.
DjangoPythonBitcoinTuxDebianHTML5 badgeSaltStackUpset confused bugMoneyHackerUpset confused bugX.OrggitFirefoxWindowMakerBashIs it worth the time?i3 window managerWagtailContainerIrssiNginxSilenceUse a maskWorldInternet securityPianoFontGnuPGThunderbirdJenkinshome-assistant-logo