.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        Click :ref:`here <sphx_glr_download_auto_examples_text_plot_hashing_vs_dict_vectorizer.py>` to download the full example code or to run this example in your browser via Binder.

    .. rst-class:: sphx-glr-example-title

    .. _sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py:


===========================================
FeatureHasher and DictVectorizer Comparison
===========================================

Compares FeatureHasher and DictVectorizer by using both to vectorize
text documents.

The example demonstrates syntax and speed only; it doesn't actually do
anything useful with the extracted vectors. See the example scripts
{document_classification_20newsgroups,clustering}.py for actual learning
on text documents.

A discrepancy between the number of terms reported for DictVectorizer and
for FeatureHasher is to be expected due to hash collisions.


.. rst-class:: sphx-glr-script-out

 Out:

 .. code-block:: none


    Usage: /home/circleci/project/examples/text/plot_hashing_vs_dict_vectorizer.py [n_features_for_hashing]
        The default number of features is 2**18.

    Loading 20 newsgroups training data
    3803 documents - 6.245MB

    DictVectorizer
    done in 1.034657s at 6.036MB/s
    Found 47928 unique terms

    FeatureHasher on frequency dicts
    done in 0.639051s at 9.772MB/s
    Found 43873 unique terms

    FeatureHasher on raw tokens
    done in 0.603110s at 10.354MB/s
    Found 43873 unique terms


|


.. code-block:: default


    # Author: Lars Buitinck
    # License: BSD 3 clause
    from collections import defaultdict
    import re
    import sys
    from time import time

    import numpy as np

    from sklearn.datasets import fetch_20newsgroups
    from sklearn.feature_extraction import DictVectorizer, FeatureHasher


    def n_nonzero_columns(X):
        """Count the columns of X that hold at least one non-zero entry.

        X only needs a ``nonzero()`` method returning ``(rows, cols)`` index
        sequences, as CSR matrices do.
        """
        # The second item of nonzero() lists the column index of every
        # non-zero entry; the distinct values are the occupied columns.
        column_indices = X.nonzero()[1]
        occupied_columns = np.unique(column_indices)
        return occupied_columns.size


    def tokens(doc):
        """Return a generator over the lower-cased word tokens of doc.

        Tokens are the runs of word characters matched by a simple regex.
        CountVectorizer and TfidfVectorizer offer a more principled
        tokenization.
        """
        words = re.findall(r"\w+", doc)
        return (word.lower() for word in words)


    def token_freqs(doc):
        """Build a mapping from each token of doc to its occurrence count.

        The result is a ``defaultdict(int)`` keyed by the tokens produced by
        ``tokens(doc)``.
        """
        counts = defaultdict(int)
        for token in tokens(doc):
            # Missing keys start at 0 thanks to defaultdict(int).
            counts[token] = counts[token] + 1
        return counts


    # Newsgroup categories requested from fetch_20newsgroups below.
    categories = [
        'alt.atheism',
        'comp.graphics',
        'comp.sys.ibm.pc.hardware',
        'misc.forsale',
        'rec.autos',
        'sci.space',
        'talk.religion.misc',
    ]
    # Uncomment the following line to use a larger set (11k+ documents)
    # categories = None

    # Print the module docstring followed by a short usage reminder. The
    # usage text is printed unconditionally, not only on bad arguments.
    print(__doc__)
    print("Usage: %s [n_features_for_hashing]" % sys.argv[0])
    print("    The default number of features is 2**18.")
    print()

    # The optional first command-line argument sets the number of hashed
    # features: no argument falls back to 2**18, and a non-integer argument
    # reports the offending value and exits with status 1.
    try:
        n_features = int(sys.argv[1])
    except IndexError:
        n_features = 2 ** 18
    except ValueError:
        print("not a valid number of features: %r" % sys.argv[1])
        sys.exit(1)


    print("Loading 20 newsgroups training data")
    # Only the documents are used; the second returned value (the targets,
    # per return_X_y=True) is discarded.
    raw_data, _ = fetch_20newsgroups(subset='train', categories=categories,
                                     return_X_y=True)
    # Corpus size in units of 10**6 UTF-8 encoded bytes; it is the numerator
    # of every MB/s figure printed below.
    data_size_mb = sum(len(s.encode('utf-8')) for s in raw_data) / 1e6
    print("%d documents - %0.3fMB" % (len(raw_data), data_size_mb))
    print()

    # Benchmark 1: DictVectorizer on per-document frequency dicts. The
    # generator is handed over unconsumed, so tokenization and counting run
    # inside the timed region. The transformed matrix is discarded; only the
    # elapsed time and the vocabulary size are reported.
    print("DictVectorizer")
    t0 = time()
    vectorizer = DictVectorizer()
    vectorizer.fit_transform(token_freqs(d) for d in raw_data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
    print("Found %d unique terms" % len(vectorizer.get_feature_names()))
    print()

    # Benchmark 2: FeatureHasher on the same frequency dicts. transform() is
    # called directly, without a fit step. The "unique terms" figure is the
    # number of output columns holding at least one non-zero value.
    print("FeatureHasher on frequency dicts")
    t0 = time()
    hasher = FeatureHasher(n_features=n_features)
    X = hasher.transform(token_freqs(d) for d in raw_data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
    print("Found %d unique terms" % n_nonzero_columns(X))
    print()

    # Benchmark 3: FeatureHasher fed raw token iterables
    # (input_type="string") instead of precomputed frequency dicts.
    print("FeatureHasher on raw tokens")
    t0 = time()
    hasher = FeatureHasher(n_features=n_features, input_type="string")
    X = hasher.transform(tokens(d) for d in raw_data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_size_mb / duration))
    print("Found %d unique terms" % n_nonzero_columns(X))


.. rst-class:: sphx-glr-timing

   **Total running time of the script:** ( 0 minutes  2.546 seconds)


.. _sphx_glr_download_auto_examples_text_plot_hashing_vs_dict_vectorizer.py:


.. only:: html

 .. container:: sphx-glr-footer
    :class: sphx-glr-footer-example


  .. container:: binder-badge

    .. image:: https://mybinder.org/badge_logo.svg
      :target: https://mybinder.org/v2/gh/scikit-learn/scikit-learn/0.23.X?urlpath=lab/tree/notebooks/auto_examples/text/plot_hashing_vs_dict_vectorizer.ipynb
      :width: 150 px


  .. container:: sphx-glr-download sphx-glr-download-python

     :download:`Download Python source code: plot_hashing_vs_dict_vectorizer.py <plot_hashing_vs_dict_vectorizer.py>`


  .. container:: sphx-glr-download sphx-glr-download-jupyter

     :download:`Download Jupyter notebook: plot_hashing_vs_dict_vectorizer.ipynb <plot_hashing_vs_dict_vectorizer.ipynb>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_