
Modules

main(verbose, covid, inp, out, csv, num, rec, unstructured, filters, codedict, topics, assign, cat, summary, sentiment, sentence, nlp, nnet, cls, knn, kmeans, cart, pca, regression, lstm, ml, visualize, ignore, include, outcome, source, sources, print_args, clear)

CRISP-T: Cross Industry Standard Process for Triangulation.

A comprehensive framework for analyzing textual and numerical data using advanced NLP, machine learning, and statistical techniques.
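The example below is a minimal sketch of driving the command in-process with click's test runner; the folder path is hypothetical and is assumed to contain a corpus.json saved by an earlier run, and the flags simply request topic modeling and sentiment reports.

# Minimal in-process invocation sketch (paths are illustrative):
# "output/my_corpus" is assumed to hold a corpus.json written by a previous run.
from click.testing import CliRunner

from crisp_t.cli import main

runner = CliRunner()
result = runner.invoke(
    main,
    ["--inp", "output/my_corpus", "--topics", "--sentiment", "--num", "5", "--rec", "10"],
)
print(result.output)                  # analysis reports echoed by the command
print("exit code:", result.exit_code)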

Source code in src/crisp_t/cli.py
@click.command()
@click.option("--verbose", "-v", is_flag=True, help="Print verbose messages.")
@click.option(
    "--covid", "-cf", default="", help="Download COVID narratives from the website"
)
@click.option("--inp", "-i", help="Load corpus from a folder containing corpus.json")
@click.option("--out", "-o", help="Write corpus to a folder as corpus.json")
@click.option("--csv", default="", help="CSV file name")
@click.option(
    "--num", "-n", default=3, help="N (clusters/epochs, etc, depending on context)"
)
@click.option("--rec", "-r", default=3, help="Record or top_n (based on context)")
@click.option(
    "--unstructured",
    "-t",
    multiple=True,
    help="Csv columns with text data that needs to be treated as text. (Ex. Free text comments)",
)
@click.option(
    "--filters",
    "-f",
    multiple=True,
    help="Filters to apply as key=value (can be used multiple times)",
)
@click.option("--codedict", is_flag=True, help="Generate coding dictionary")
@click.option("--topics", is_flag=True, help="Generate topic model")
@click.option("--assign", is_flag=True, help="Assign documents to topics")
@click.option(
    "--cat", is_flag=True, help="List categories of entire corpus or individual docs"
)
@click.option(
    "--summary",
    is_flag=True,
    help="Generate summary for entire corpus or individual docs",
)
@click.option(
    "--sentiment",
    is_flag=True,
    help="Generate sentiment score for entire corpus or individual docs",
)
@click.option(
    "--sentence",
    is_flag=True,
    default=False,
    help="Generate sentence-level scores when applicable",
)
@click.option("--nlp", is_flag=True, help="Generate all NLP reports")
@click.option("--ml", is_flag=True, help="Generate all ML reports")
@click.option("--nnet", is_flag=True, help="Display accuracy of a neural network model")
@click.option(
    "--cls",
    is_flag=True,
    help="Display confusion matrix from classifiers (SVM, Decision Tree)",
)
@click.option("--knn", is_flag=True, help="Display nearest neighbours")
@click.option("--kmeans", is_flag=True, help="Display KMeans clusters")
@click.option("--cart", is_flag=True, help="Display Association Rules")
@click.option("--pca", is_flag=True, help="Display PCA")
@click.option(
    "--regression", is_flag=True, help="Display linear or logistic regression results"
)
@click.option("--lstm", is_flag=True, help="Train LSTM model on text data to predict outcome variable")
@click.option("--visualize", is_flag=True, help="Visualize words, topics or wordcloud")
@click.option(
    "--ignore",
    default="",
    help="Comma separated ignore words or columns depending on context",
)
@click.option(
    "--include", default="", help="Comma separated columns to include from csv"
)
@click.option("--outcome", default="", help="Outcome variable for ML tasks")
@click.option("--source", "-s", help="Source URL or directory path to read data from")
@click.option("--print", "-p", "print_args", multiple=True, help="Display corpus information. Usage: --print documents --print 10, or quoted: --print 'documents 10'")
@click.option(
    "--sources",
    multiple=True,
    help="Multiple sources (URLs or directories) to read data from; can be used multiple times",
)
@click.option("--clear", is_flag=True, help="Clear cache before running analysis")
def main(
    verbose,
    covid,
    inp,
    out,
    csv,
    num,
    rec,
    unstructured,
    filters,
    codedict,
    topics,
    assign,
    cat,
    summary,
    sentiment,
    sentence,
    nlp,
    nnet,
    cls,
    knn,
    kmeans,
    cart,
    pca,
    regression,
    lstm,
    ml,
    visualize,
    ignore,
    include,
    outcome,
    source,
    sources,
    print_args,
    clear,
):
    """CRISP-T: Cross Industry Standard Process for Triangulation.

    A comprehensive framework for analyzing textual and numerical data using
    advanced NLP, machine learning, and statistical techniques.
    """

    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        click.echo("Verbose mode enabled")

    click.echo("_________________________________________")
    click.echo("CRISP-T: Qualitative Research Analysis Framework")
    click.echo(f"Version: {__version__}")
    click.echo("_________________________________________")

    # Initialize components
    read_data = ReadData()
    corpus = None
    text_analyzer = None
    csv_analyzer = None
    ml_analyzer = None

    if clear:
        _clear_cache()

    try:
        # Handle COVID data download
        if covid:
            if not source:
                raise click.ClickException(
                    "--source (output folder) is required when using --covid."
                )
            click.echo(f"Downloading COVID narratives from: {covid} to {source}")
            try:
                from .utils import QRUtils

                QRUtils.read_covid_narratives(source, covid)
                click.echo(f"✓ COVID narratives downloaded to {source}")
            except Exception as e:
                raise click.ClickException(f"COVID download failed: {e}")

        # Build corpus using helpers (source preferred over inp)
        # if not source or inp, use default folders or env vars
        try:
            text_cols = ",".join(unstructured) if unstructured else ""
            corpus = initialize_corpus(
                source=source,
                inp=inp,
                comma_separated_text_columns=text_cols,
                comma_separated_ignore_words=(ignore if ignore else None),
            )
            # If filters were provided with ':' while using --source, emit guidance message
            if source and filters:
                if any(":" in flt and "=" not in flt for flt in filters):
                    click.echo("Filters are not supported when using --source")
        except click.ClickException:
            raise
        except Exception as e:
            click.echo(f"✗ Error initializing corpus: {e}", err=True)
            logger.error(f"Failed to initialize corpus: {e}")
            return

        # Handle multiple sources (unchanged behavior, but no filters applied here)
        if sources and not corpus:
            loaded_any = False
            for src in sources:
                click.echo(f"Reading data from source: {src}")
                try:
                    read_data.read_source(
                        src, comma_separated_ignore_words=ignore if ignore else None
                    )
                    loaded_any = True
                except Exception as e:
                    logger.error(f"Failed to read source {src}: {e}")
                    raise click.ClickException(str(e))

            if loaded_any:
                corpus = read_data.create_corpus(
                    name="Corpus from multiple sources",
                    description=f"Data loaded from {len(sources)} sources",
                )
                click.echo(
                    f"✓ Successfully loaded {len(corpus.documents)} document(s) from {len(sources)} sources"
                )
                # Filters are not applied for --sources in bulk mode

        # Load csv from corpus.df if available via helper
        if corpus and getattr(corpus, "df", None) is not None:
            try:
                text_cols = ",".join(unstructured) if unstructured else ""
                csv_analyzer = get_csv_analyzer(
                    corpus,
                    comma_separated_unstructured_text_columns=text_cols,
                    comma_separated_ignore_columns=(ignore if ignore else ""),
                    filters=filters,
                )
            except Exception as e:
                click.echo(f"✗ Error preparing CSV analyzer: {e}", err=True)
                logger.error(f"Failed to create CSV analyzer: {e}")
                return

        # Load CSV data (deprecated)
        if csv:
            click.echo(
                "--csv option has been deprecated. Put csv file in --source folder instead."
            )

        # Initialize ML analyzer if available and ML functions are requested
        if (
            ML_AVAILABLE
            and (nnet or cls or knn or kmeans or cart or pca or regression or lstm or ml)
            and csv_analyzer
        ):
            if include:
                csv_analyzer.comma_separated_include_columns(include)
            ml_analyzer = ML(csv=csv_analyzer)  # type: ignore
        else:
            if (nnet or cls or knn or kmeans or cart or pca or regression or lstm or ml) and not ML_AVAILABLE:
                click.echo("Machine learning features require additional dependencies.")
                click.echo("Install with: pip install crisp-t[ml]")
            if (nnet or cls or knn or kmeans or cart or pca or regression or lstm or ml) and not csv_analyzer:
                click.echo(
                    "ML analysis requires CSV data. Use --csv to provide a data file."
                )

        # Initialize Text analyzer and apply filters using helper if we have a corpus
        if corpus and not text_analyzer:
            text_analyzer = get_text_analyzer(corpus, filters=filters)

        # Ensure we have data to work with
        if not corpus and not csv_analyzer:
            click.echo(
                "No input data provided. Use --inp for text files"
            )
            return

        # Text Analysis Operations
        if text_analyzer:
            if nlp or codedict:
                click.echo("\n=== Generating Coding Dictionary ===")
                click.echo(
                    """
                Coding Dictionary Format:
                - CATEGORY: Common verbs representing main actions or themes.
                - PROPERTY: Common nouns associated with each CATEGORY.
                - DIMENSION: Common adjectives, adverbs, or verbs associated with each PROPERTY.

                Hint:   Use --ignore with a comma-separated list of words to exclude common but uninformative words.
                        Use --filters to narrow down documents based on metadata.
                        Use --num to adjust the number of categories displayed.
                        Use --rec to adjust the number of top items displayed per section.
                """
                )
                try:
                    text_analyzer.make_spacy_doc()
                    coding_dict = text_analyzer.print_coding_dictionary(
                        num=num, top_n=rec
                    )
                    if out:
                        _save_output(coding_dict, out, "coding_dictionary")
                except Exception as e:
                    click.echo(f"Error generating coding dictionary: {e}")

            if nlp or topics:
                click.echo("\n=== Topic Modeling ===")
                click.echo(
                    """
                Topic Modeling Output Format:
                Each topic is represented as a list of words with associated weights indicating their importance within the topic.
                Example:
                Topic 0: 0.116*"category" + 0.093*"comparison" + 0.070*"incident" + ...
                Hint:   Use --num to adjust the number of topics generated.
                        Use --filters to narrow down documents based on metadata.
                        Use --rec to adjust the number of words displayed per topic.
                """
                )
                try:
                    cluster_analyzer = Cluster(corpus=corpus)
                    cluster_analyzer.build_lda_model(topics=num)
                    topics_result = cluster_analyzer.print_topics(num_words=rec)
                    click.echo(
                        f"Generated {len(topics_result)} topics as above with the weights in brackets."
                    )
                    if out:
                        _save_output(topics_result, out, "topics")
                except Exception as e:
                    click.echo(f"Error generating topics: {e}")

            if nlp or assign:
                click.echo("\n=== Document-Topic Assignments ===")
                click.echo(
                    """
                Document-Topic Assignment Format:
                Each document is assigned to the topic it is most associated with, along with the contribution percentage.
                Hint: --visualize adds a DataFrame to corpus.visualization["assign_topics"] for visualization.
                """
                )
                try:
                    if "cluster_analyzer" not in locals():
                        cluster_analyzer = Cluster(corpus=corpus)
                        cluster_analyzer.build_lda_model(topics=num)
                    assignments = cluster_analyzer.format_topics_sentences(
                        visualize=visualize
                    )
                    document_assignments = cluster_analyzer.print_clusters()
                    click.echo(f"Assigned {len(assignments)} documents to topics")
                    if out:
                        _save_output(assignments, out, "topic_assignments")
                except Exception as e:
                    click.echo(f"Error assigning topics: {e}")

            if nlp or cat:
                click.echo("\n=== Category Analysis ===")
                click.echo(
                    """
                Category Analysis Output Format:
                           A list of common concepts or themes in "bag_of_terms" with corresponding weights.
                Hint:   Use --num to adjust the number of categories displayed.
                        Use --filters to narrow down documents based on metadata.
                """
                )
                try:
                    text_analyzer.make_spacy_doc()
                    categories = text_analyzer.print_categories(num=num)
                    if out:
                        _save_output(categories, out, "categories")
                except Exception as e:
                    click.echo(f"Error generating categories: {e}")

            if nlp or summary:
                click.echo("\n=== Text Summarization ===")
                click.echo(
                    """
                Text Summarization Output Format: A list of important sentences representing the main points of the text.
                Hint:   Use --num to adjust the number of sentences in the summary.
                        Use --filters to narrow down documents based on metadata.
                """
                )
                try:
                    text_analyzer.make_spacy_doc()
                    summary_result = text_analyzer.generate_summary(weight=num)
                    click.echo(summary_result)
                    if out:
                        _save_output(summary_result, out, "summary")
                except Exception as e:
                    click.echo(f"Error generating summary: {e}")

            if nlp or sentiment:
                click.echo("\n=== Sentiment Analysis ===")
                click.echo(
                    """
                Sentiment Analysis Output Format:
                           neg, neu, pos, compound scores.
                Hint:   Use --filters to narrow down documents based on metadata.
                        Use --sentence to get document-level sentiment scores.
                """
                )
                try:
                    sentiment_analyzer = Sentiment(corpus=corpus)  # type: ignore
                    sentiment_results = sentiment_analyzer.get_sentiment(
                        documents=sentence, verbose=verbose
                    )
                    click.echo(sentiment_results)
                    if out:
                        _save_output(sentiment_results, out, "sentiment")
                except Exception as e:
                    click.echo(f"Error generating sentiment analysis: {e}")

        # Machine Learning Operations
        if ml_analyzer and ML_AVAILABLE:
            target_col = outcome

            if kmeans or ml:
                click.echo("\n=== K-Means Clustering ===")
                click.echo(
                    """
                           K-Means clustering removes non-numeric columns.
                           Additionally it removes NaN values.
                           So combining with other ML options may not work as expected.
                Hint:   Use --num to adjust the number of clusters generated.
                """
                )
                csv_analyzer.retain_numeric_columns_only()
                csv_analyzer.drop_na()
                _ml_analyzer = ML(csv=csv_analyzer)
                clusters, members = _ml_analyzer.get_kmeans(
                    number_of_clusters=num, verbose=verbose
                )
                _ml_analyzer.profile(members, number_of_clusters=num)
                if out:
                    _save_output(
                        {"clusters": clusters, "members": members}, out, "kmeans"
                    )

            if (cls or ml) and target_col:
                click.echo("\n=== Classifier Evaluation ===")
                click.echo(
                    """
                           Classifier
                            - SVM: Support Vector Machine classifier with confusion matrix output.
                            - Decision Tree: Decision Tree classifier with feature importance output.
                Hint:   Use --outcome to specify the target variable for classification.
                        Use --rec to adjust the number of top important features displayed.
                        Use --include to specify columns to include in the analysis (comma separated).
                """
                )
                if not target_col:
                    raise click.ClickException(
                        "--outcome is required for classification tasks"
                    )
                click.echo("\n=== SVM ===")
                try:
                    confusion_matrix = ml_analyzer.svm_confusion_matrix(
                        y=target_col, test_size=0.25
                    )
                    click.echo(
                        ml_analyzer.format_confusion_matrix_to_human_readable(
                            confusion_matrix
                        )
                    )
                    if out:
                        _save_output(confusion_matrix, out, "svm_results")
                except Exception as e:
                    click.echo(f"Error performing SVM classification: {e}")
                click.echo("\n=== Decision Tree Classification ===")
                try:
                    cm, importance = ml_analyzer.get_decision_tree_classes(
                        y=target_col, top_n=rec
                    )
                    click.echo("\n=== Feature Importance ===")
                    click.echo(
                        ml_analyzer.format_confusion_matrix_to_human_readable(cm)
                    )
                    if out:
                        _save_output(cm, out, "decision_tree_results")
                except Exception as e:
                    click.echo(f"Error performing Decision Tree classification: {e}")

            if (nnet or ml) and target_col:
                click.echo("\n=== Neural Network Classification Accuracy ===")
                click.echo(
                    """
                            Neural Network classifier with accuracy output.
                Hint:   Use --outcome to specify the target variable for classification.
                        Use --include to specify columns to include in the analysis (comma separated).
                """
                )
                if not target_col:
                    raise click.ClickException(
                        "--outcome is required for neural network tasks"
                    )
                try:
                    predictions = ml_analyzer.get_nnet_predictions(y=target_col)
                    if out:
                        _save_output(predictions, out, "nnet_results")
                except Exception as e:
                    click.echo(f"Error performing Neural Network classification: {e}")

            if (knn or ml) and target_col:
                click.echo("\n=== K-Nearest Neighbors ===")
                click.echo(
                    """
                           K-Nearest Neighbors search results.
                Hint:   Use --outcome to specify the target variable for KNN search.
                        Use --rec to specify the record number to search from (1-based index).
                        Use --num to specify the number of nearest neighbors to retrieve.
                        Use --include to specify columns to include in the analysis (comma separated).
                """
                )
                if not target_col:
                    raise click.ClickException(
                        "--outcome is required for KNN search tasks"
                    )
                if rec < 1:
                    raise click.ClickException(
                        "--rec must be a positive integer (1-based index)"
                    )
                try:
                    knn_results = ml_analyzer.knn_search(y=target_col, n=num, r=rec)
                    if out:
                        _save_output(knn_results, out, "knn_results")
                except Exception as e:
                    click.echo(f"Error performing K-Nearest Neighbors search: {e}")

            if (cart or ml) and target_col:
                click.echo("\n=== Association Rules (CART) ===")
                click.echo(
                    """
                           Association Rules using the Apriori algorithm.
                Hint:   Use --outcome to specify the target variable to remove from features.
                        Use --num to specify the minimum support (between 1 and 99).
                        Use --rec to specify the minimum threshold for the rules (between 1 and 99).
                        Use --include to specify columns to include in the analysis (comma separated).
                """
                )
                if not target_col:
                    raise click.ClickException(
                        "--outcome is required for association rules tasks"
                    )
                if not (1 <= num <= 99):
                    raise click.ClickException(
                        "--num must be between 1 and 99 for min_support"
                    )
                if not (1 <= rec <= 99):
                    raise click.ClickException(
                        "--rec must be between 1 and 99 for min_threshold"
                    )
                _min_support = float(num / 100)
                _min_threshold = float(rec / 100)
                click.echo(
                    f"Using min_support={_min_support:.2f} and min_threshold={_min_threshold:.2f}"
                )
                try:
                    apriori_results = ml_analyzer.get_apriori(
                        y=target_col,
                        min_support=_min_support,
                        min_threshold=_min_threshold,
                    )
                    click.echo(apriori_results)
                    if out:
                        _save_output(apriori_results, out, "association_rules")
                except Exception as e:
                    click.echo(f"Error generating association rules: {e}")

            if (pca or ml) and target_col:
                click.echo("\n=== Principal Component Analysis ===")
                click.echo(
                    """
                           Principal Component Analysis (PCA) results.
                Hint:   Use --outcome to specify the target variable to remove from features.
                        Use --num to specify the number of principal components to generate.
                        Use --include to specify columns to include in the analysis (comma separated).
                """
                )
                try:
                    pca_results = ml_analyzer.get_pca(y=target_col, n=num)
                    if out:
                        _save_output(pca_results, out, "pca_results")
                except Exception as e:
                    click.echo(f"Error performing Principal Component Analysis: {e}")

            if (regression or ml) and target_col:
                click.echo("\n=== Regression Analysis ===")
                click.echo(
                    """
                           Regression Analysis (Linear or Logistic Regression).
                           Automatically detects binary outcomes for logistic regression.
                           Otherwise uses linear regression for continuous outcomes.
                Hint:   Use --outcome to specify the target variable for regression.
                        Use --include to specify columns to include in the analysis (comma separated).
                """
                )
                try:
                    regression_results = ml_analyzer.get_regression(y=target_col)
                    if out:
                        _save_output(regression_results, out, "regression_results")
                except Exception as e:
                    click.echo(f"Error performing regression analysis: {e}")

            if (lstm or ml) and target_col:
                click.echo("\n=== LSTM Text Classification ===")
                click.echo(
                    """
                           LSTM (Long Short-Term Memory) model for text-based prediction.
                           Tests if text documents converge towards predicting the outcome variable.
                           Requires both text documents and an 'id' column to align texts with outcome.
                Hint:   Use --outcome to specify the target variable for LSTM prediction.
                        The outcome should be binary (two classes).
                        Ensure documents have IDs matching the 'id' column in your data.
                """
                )
                if not target_col:
                    raise click.ClickException(
                        "--outcome is required for LSTM prediction tasks"
                    )
                try:
                    lstm_results = ml_analyzer.get_lstm_predictions(y=target_col)
                    if out:
                        _save_output(lstm_results, out, "lstm_results")
                except Exception as e:
                    click.echo(f"Error performing LSTM prediction: {e}")

        elif (nnet or cls or knn or kmeans or cart or pca or regression or lstm or ml) and not ML_AVAILABLE:
            click.echo("Machine learning features require additional dependencies.")
            click.echo("Install with: pip install crisp-t[ml]")

        # Save corpus and csv if output path is specified
        if out and corpus:
            if filters and inp and out and inp == out:
                raise click.ClickException(
                    "--out cannot be the same as --inp when using --filters. Please specify a different output folder to avoid overwriting input data."
                )
            if filters and ((not inp) or (not out)):
                raise click.ClickException(
                    "Both --inp and --out must be specified when using --filters."
                )
            output_path = pathlib.Path(out)
            # Allow both directory and a file path '.../corpus.json'
            if output_path.suffix:
                # Ensure parent exists
                output_path.parent.mkdir(parents=True, exist_ok=True)
                save_base = output_path
            else:
                output_path.mkdir(parents=True, exist_ok=True)
                save_base = output_path / "corpus.json"
            read_data.write_corpus_to_json(str(save_base), corpus=corpus)
            click.echo(f"✓ Corpus and csv saved to {save_base}")

        if print_args and corpus:
            click.echo("\n=== Corpus Details ===")
            # Join the print arguments into a single string
            print_command = " ".join(print_args) if print_args else None
            if print_command:
                click.echo(corpus.pretty_print(show=print_command))

        click.echo("\n=== Analysis Complete ===")

    except click.ClickException:
        # Let Click handle and set non-zero exit code
        raise
    except Exception as e:
        # Convert unexpected exceptions to ClickException for non-zero exit code
        if verbose:
            import traceback

            traceback.print_exc()
        raise click.ClickException(str(e))

Copyright (C) 2025 Bell Eapen

This file is part of crisp-t.

crisp-t is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

crisp-t is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with crisp-t. If not, see https://www.gnu.org/licenses/.

ReadData
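
Reads text from URLs, folders of .txt/.pdf/.csv files, or a previously saved corpus.json, and assembles the result into a Corpus (a list of documents plus an optional DataFrame). A short usage sketch follows, assuming ./interviews is a local folder of transcripts; the path, ignore words, and corpus name are illustrative only.

from crisp_t.read_data import ReadData

reader = ReadData()
reader.read_source("./interviews", comma_separated_ignore_words="um,uh")
corpus = reader.create_corpus(
    name="Interview corpus",
    description="Transcripts loaded from ./interviews",
)
reader.pretty_print()                    # dump the corpus (minus df/visualization) as JSON
reader.write_corpus_to_json("./output")  # writes corpus.json (and corpus_df.csv when a DataFrame is present)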

Source code in src/crisp_t/read_data.py
class ReadData:

    def __init__(self, corpus: Corpus | None = None, source=None):
        self._corpus = corpus
        self._source = source
        self._documents = []
        self._df = pd.DataFrame()

    @property
    def corpus(self):
        """
        Get the corpus.
        """
        if not self._corpus:
            raise ValueError("No corpus found. Please create a corpus first.")
        self._corpus.documents = self._documents
        self._corpus.df = self._df
        return self._corpus

    @property
    def documents(self):
        """
        Get the documents.
        """
        if not self._documents:
            raise ValueError("No documents found. Please read data first.")
        return self._documents

    @property
    def df(self):
        """
        Get the dataframe.
        """
        if self._df is None:
            raise ValueError("No dataframe found. Please read data first.")
        return self._df

    @corpus.setter
    def corpus(self, value):
        """
        Set the corpus.
        """
        if not isinstance(value, Corpus):
            raise ValueError("Value must be a Corpus object.")
        self._corpus = value

    @documents.setter
    def documents(self, value):
        """
        Set the documents.
        """
        if not isinstance(value, list):
            raise ValueError("Value must be a list of Document objects.")
        for document in value:
            if not isinstance(document, Document):
                raise ValueError("Value must be a list of Document objects.")
        self._documents = value

    @df.setter
    def df(self, value):
        """
        Set the dataframe.
        """
        if not isinstance(value, pd.DataFrame):
            raise ValueError("Value must be a pandas DataFrame.")
        self._df = value

    def pretty_print(self):
        """
        Pretty print the corpus.
        """
        if not self._corpus:
            self.create_corpus()
        if self._corpus:
            print(
                self._corpus.model_dump_json(indent=4, exclude={"df", "visualization"})
            )
            logger.info(
                "Corpus: %s",
                self._corpus.model_dump_json(indent=4, exclude={"df", "visualization"}),
            )
        else:
            logger.error("No corpus available to pretty print.")

    # TODO: Enforce only one corpus (Singleton pattern)
    def create_corpus(self, name=None, description=None):
        """
        Create a corpus from the documents and dataframe.
        """
        if not self._documents:
            raise ValueError("No documents found. Please read data first.")
        if self._corpus:
            self._corpus.documents = self._documents
            self._corpus.df = self._df
        else:
            timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            corpus_id = f"corpus_{timestamp}"
            self._corpus = Corpus(
                documents=self._documents,
                df=self._df,
                visualization={},
                metadata={},
                id=corpus_id,
                score=0.0,
                name=name,
                description=description,
            )
        return self._corpus

    def get_documents_from_corpus(self):
        """
        Get the documents from the corpus.
        """
        if not self._corpus:
            raise ValueError("No corpus found. Please create a corpus first.")
        return self._corpus.documents

    def get_document_by_id(self, doc_id):
        """
        Get a document from the corpus by its ID. Uses parallel search for large corpora.
        """
        if not self._corpus:
            raise ValueError("No corpus found. Please create a corpus first.")
        documents = self._corpus.documents
        if len(documents) < 10:
            for document in tqdm(documents, desc="Searching documents", disable=True):
                if document.id == doc_id:
                    return document
        else:
            n_cores = multiprocessing.cpu_count()
            with ThreadPoolExecutor() as executor:
                futures = {
                    executor.submit(lambda doc: doc.id == doc_id, document): i
                    for i, document in enumerate(documents)
                }
                with tqdm(
                    total=len(futures),
                    desc=f"Searching documents (parallel, {n_cores} cores)",
                ) as pbar:
                    for future in as_completed(futures):
                        i = futures[future]
                        found = future.result()
                        pbar.update(1)
                        if found:
                            return documents[i]
        raise ValueError("Document not found: %s" % doc_id)

    def write_corpus_to_json(self, file_path="", corpus=None):
        """
        Write the corpus to a json file.

        Accepts either a directory path or an explicit file path ending with
        'corpus.json'. In both cases, a sibling 'corpus_df.csv' will be written
        next to the json if a DataFrame is available.
        """
        from pathlib import Path

        path = Path(file_path)
        # Determine targets
        if path.suffix:  # treat as explicit file path
            file_name = path
            df_name = path.with_name("corpus_df.csv")
        else:
            file_name = path / "corpus.json"
            df_name = path / "corpus_df.csv"

        corp = corpus if corpus is not None else self._corpus
        if not corp:
            raise ValueError("No corpus found. Please create a corpus first.")
        file_name.parent.mkdir(parents=True, exist_ok=True)
        with open(file_name, "w") as f:
            json.dump(corp.model_dump(exclude={"df", "visualization"}), f, indent=4)
        if corp.df is not None and isinstance(corp.df, pd.DataFrame):
            if not corp.df.empty:
                corp.df.to_csv(df_name, index=False)
        logger.info("Corpus written to %s", file_name)

    # @lru_cache(maxsize=3)
    def read_corpus_from_json(self, file_path="", comma_separated_ignore_words=""):
        """
        Read the corpus from a json file. Parallelizes ignore word removal for large corpora.
        """
        from pathlib import Path

        file_path = Path(file_path)
        file_name = file_path / "corpus.json"
        df_name = file_path / "corpus_df.csv"
        if self._source:
            file_name = Path(self._source) / file_name
        if not file_name.exists():
            raise ValueError(f"File not found: {file_name}")
        with open(file_name, "r") as f:
            data = json.load(f)
            self._corpus = Corpus.model_validate(data)
            logger.info(f"Corpus read from {file_name}")
        if df_name.exists():
            self._corpus.df = pd.read_csv(df_name)
        else:
            self._corpus.df = None
        # Remove ignore words from self._corpus.documents text
        documents = self._corpus.documents

        # Pre-compile regex patterns once for efficiency instead of inside loops
        compiled_patterns = []
        if comma_separated_ignore_words:
            for word in comma_separated_ignore_words.split(","):
                pattern = re.compile(r"\b" + word.strip() + r"\b", flags=re.IGNORECASE)
                compiled_patterns.append(pattern)

        if len(documents) < 10:
            processed_docs = []
            for document in tqdm(documents, desc="Processing documents", disable=True):
                for pattern in compiled_patterns:
                    document.text = pattern.sub("", document.text)
                processed_docs.append(document)
        else:

            def process_doc(document):
                for pattern in compiled_patterns:
                    document.text = pattern.sub("", document.text)
                return document

            processed_docs = []
            n_cores = multiprocessing.cpu_count()
            with ThreadPoolExecutor() as executor:
                futures = {
                    executor.submit(process_doc, document): document
                    for document in documents
                }
                with tqdm(
                    total=len(futures),
                    desc=f"Processing documents (parallel, {n_cores} cores)",
                ) as pbar:
                    for future in as_completed(futures):
                        processed_docs.append(future.result())
                        pbar.update(1)
        self._corpus.documents = processed_docs
        return self._corpus

    # @lru_cache(maxsize=3)
    def read_csv_to_corpus(
        self,
        file_name,
        comma_separated_ignore_words=None,
        comma_separated_text_columns="",
        id_column="",
    ):
        """
        Read the corpus from a csv file. Parallelizes document creation for large CSVs.
        """
        from pathlib import Path

        file_name = Path(file_name)
        if not file_name.exists():
            raise ValueError(f"File not found: {file_name}")
        df = pd.read_csv(file_name)
        original_df = df.copy()
        if comma_separated_text_columns:
            text_columns = comma_separated_text_columns.split(",")
        else:
            text_columns = []
        # remove text columns from the dataframe
        for column in text_columns:
            if column in df.columns:
                df.drop(column, axis=1, inplace=True)
        # Set self._df to the numeric part after dropping text columns
        self._df = df.copy()
        rows = list(original_df.iterrows())

        # Pre-compile regex patterns once for efficiency instead of inside loops
        compiled_patterns = []
        if comma_separated_ignore_words:
            for word in comma_separated_ignore_words.split(","):
                pattern = re.compile(r"\b" + word.strip() + r"\b", flags=re.IGNORECASE)
                compiled_patterns.append(pattern)

        def create_document(args):
            index, row = args
            # Use list and join for efficient string concatenation, handle None values
            text_parts = [
                str(row[column])
                if row[column] is not None
                and not (isinstance(row[column], float) and row[column] != row[column])
                else ""
                for column in text_columns
            ]
            read_from_file = " ".join(text_parts)
            # Apply pre-compiled patterns
            for pattern in compiled_patterns:
                read_from_file = pattern.sub("", read_from_file)
            _document = Document(
                text=read_from_file,
                metadata={
                    "source": str(file_name),
                    "file_name": str(file_name),
                    "row": index,
                    "id": (
                        row[id_column]
                        if (id_column != "" and id_column in original_df.columns)
                        else index
                    ),
                },
                id=str(index),
                score=0.0,
                name="",
                description="",
            )
            return read_from_file, _document

        if len(rows) < 10:
            results = [
                create_document(args)
                for args in tqdm(rows, desc="Reading CSV rows", disable=True)
            ]
        else:

            results = []
            # import multiprocessing

            n_cores = multiprocessing.cpu_count()
            with ThreadPoolExecutor() as executor:
                futures = {
                    executor.submit(create_document, args): args for args in rows
                }
                with tqdm(
                    total=len(futures),
                    desc=f"Reading CSV rows (parallel, {n_cores} cores)",
                ) as pbar:
                    for future in as_completed(futures):
                        results.append(future.result())
                        pbar.update(1)

        if len(results) < 10:
            for read_from_file, _document in tqdm(
                results, desc="Finalizing corpus", disable=True
            ):
                self._documents.append(_document)
        else:

            # import multiprocessing

            n_cores = multiprocessing.cpu_count()
            with tqdm(
                results,
                total=len(results),
                desc=f"Finalizing corpus (parallel, {n_cores} cores)",
            ) as pbar:
                for read_from_file, _document in pbar:
                    self._documents.append(_document)
        logger.info(f"Corpus read from {file_name}")
        self.create_corpus()
        return self._corpus

    def read_source(
        self, source, comma_separated_ignore_words=None, comma_separated_text_columns=""
    ):
        _CSV_EXISTS = False

        # Pre-compile regex patterns once for efficiency instead of inside loops
        compiled_patterns = []
        if comma_separated_ignore_words:
            for word in comma_separated_ignore_words.split(","):
                pattern = re.compile(r"\b" + word.strip() + r"\b", flags=re.IGNORECASE)
                compiled_patterns.append(pattern)

        def apply_ignore_patterns(text):
            """Apply pre-compiled ignore patterns to text."""
            for pattern in compiled_patterns:
                text = pattern.sub("", text)
            return text

        # if source is a url
        if source.startswith("http://") or source.startswith("https://"):
            response = requests.get(source)
            if response.status_code == 200:
                read_from_file = response.text
                read_from_file = apply_ignore_patterns(read_from_file)
                # self._content removed
                _document = Document(
                    text=read_from_file,
                    metadata={"source": source},
                    id=source,
                    score=0.0,
                    name="",
                    description="",
                )
                self._documents.append(_document)
        elif os.path.exists(source):
            source_path = Path(source)
            self._source = source
            logger.info(f"Reading data from folder: {source}")
            file_list = os.listdir(source)
            for file_name in tqdm(
                file_list, desc="Reading files", disable=len(file_list) < 10
            ):
                file_path = source_path / file_name
                if file_name.endswith(".txt"):
                    with open(file_path, "r") as f:
                        read_from_file = f.read()
                        read_from_file = apply_ignore_patterns(read_from_file)
                        # self._content removed
                        _document = Document(
                            text=read_from_file,
                            metadata={
                                "source": str(file_path),
                                "file_name": file_name,
                            },
                            id=file_name,
                            score=0.0,
                            name="",
                            description="",
                        )
                        self._documents.append(_document)
                if file_name.endswith(".pdf"):
                    with open(file_path, "rb") as f:
                        reader = PdfReader(f)
                        # Use list and join for efficient string concatenation
                        page_texts = []
                        for page in tqdm(
                            reader.pages,
                            desc=f"Reading PDF {file_name}",
                            leave=False,
                            disable=len(reader.pages) < 10,
                        ):
                            page_texts.append(page.extract_text())
                        read_from_file = "".join(page_texts)
                        read_from_file = apply_ignore_patterns(read_from_file)
                        # self._content removed
                        _document = Document(
                            text=read_from_file,
                            metadata={
                                "source": str(file_path),
                                "file_name": file_name,
                            },
                            id=file_name,
                            score=0.0,
                            name="",
                            description="",
                        )
                        self._documents.append(_document)
                if file_name.endswith(".csv") and comma_separated_text_columns == "":
                    logger.info(f"Reading CSV file: {file_path}")
                    self._df = Csv().read_csv(file_path)
                    logger.info(f"CSV file read with shape: {self._df.shape}")
                    _CSV_EXISTS = True
                if file_name.endswith(".csv") and comma_separated_text_columns != "":
                    logger.info(f"Reading CSV file to corpus: {file_path}")
                    self.read_csv_to_corpus(
                        file_path,
                        comma_separated_ignore_words,
                        comma_separated_text_columns,
                    )
                    logger.info(
                        f"CSV file read to corpus with documents: {len(self._documents)}"
                    )
                    _CSV_EXISTS = True
            if not _CSV_EXISTS:
                # create a simple csv with columns: id, number, text
                # and fill it with random data
                _csv = """
id,number,response
1,100,Sample text one
2,200,Sample text two
3,300,Sample text three
4,400,Sample text four
"""
                # write the csv to a temp file
                with tempfile.NamedTemporaryFile(
                    mode="w+", delete=False, suffix=".csv"
                ) as temp_csv:
                    temp_csv.write(_csv)
                    temp_csv_path = temp_csv.name
                logger.info(f"No CSV found. Created temp CSV file: {temp_csv_path}")
                self._df = Csv().read_csv(temp_csv_path)
                logger.info(f"CSV file read with shape: {self._df.shape}")
                # remove the temp file
                os.remove(temp_csv_path)

        else:
            raise ValueError(f"Source not found: {source}")

    def corpus_as_dataframe(self):
        """
        Convert the corpus to a pandas dataframe. Parallelizes for large corpora.
        """
        if not self._corpus:
            raise ValueError("No corpus found. Please create a corpus first.")
        documents = self._corpus.documents
        if len(documents) < 10:
            data = [
                document.model_dump()
                for document in tqdm(
                    documents, desc="Converting to dataframe", disable=True
                )
            ]
        else:
            data = []

            def dump_doc(document):
                return document.model_dump()

            n_cores = multiprocessing.cpu_count()
            with ThreadPoolExecutor() as executor:
                futures = {
                    executor.submit(dump_doc, document): document
                    for document in documents
                }
                with tqdm(
                    total=len(futures),
                    desc=f"Converting to dataframe (parallel, {n_cores} cores)",
                ) as pbar:
                    for future in as_completed(futures):
                        data.append(future.result())
                        pbar.update(1)
        df = pd.DataFrame(data)
        return df

corpus property writable

Get the corpus.

df property writable

Get the dataframe.

documents property writable

Get the documents.

corpus_as_dataframe()

Convert the corpus to a pandas dataframe. Parallelizes for large corpora.
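Illustrative usage, continuing the ReadData sketch near the top of this class (the column names assume the Document fields used elsewhere in this module):

df = reader.corpus_as_dataframe()
print(df[["id", "text"]].head())   # one row per Document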

Source code in src/crisp_t/read_data.py
def corpus_as_dataframe(self):
    """
    Convert the corpus to a pandas dataframe. Parallelizes for large corpora.
    """
    if not self._corpus:
        raise ValueError("No corpus found. Please create a corpus first.")
    documents = self._corpus.documents
    if len(documents) < 10:
        data = [
            document.model_dump()
            for document in tqdm(
                documents, desc="Converting to dataframe", disable=True
            )
        ]
    else:
        data = []

        def dump_doc(document):
            return document.model_dump()

        n_cores = multiprocessing.cpu_count()
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(dump_doc, document): document
                for document in documents
            }
            with tqdm(
                total=len(futures),
                desc=f"Converting to dataframe (parallel, {n_cores} cores)",
            ) as pbar:
                for future in as_completed(futures):
                    data.append(future.result())
                    pbar.update(1)
    df = pd.DataFrame(data)
    return df

create_corpus(name=None, description=None)

Create a corpus from the documents and dataframe.
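Typically called after one of the read_* methods has populated the internal document list. A hedged sketch with a hypothetical folder path:

reader = ReadData()
reader.read_source("./interviews")
corpus = reader.create_corpus(name="Interviews", description="Loaded from ./interviews")
print(corpus.id)                 # timestamp-based id, e.g. corpus_20250101120000
print(len(corpus.documents))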

Source code in src/crisp_t/read_data.py
def create_corpus(self, name=None, description=None):
    """
    Create a corpus from the documents and dataframe.
    """
    if not self._documents:
        raise ValueError("No documents found. Please read data first.")
    if self._corpus:
        self._corpus.documents = self._documents
        self._corpus.df = self._df
    else:
        timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        corpus_id = f"corpus_{timestamp}"
        self._corpus = Corpus(
            documents=self._documents,
            df=self._df,
            visualization={},
            metadata={},
            id=corpus_id,
            score=0.0,
            name=name,
            description=description,
        )
    return self._corpus

get_document_by_id(doc_id)

Get a document from the corpus by its ID. Uses parallel search for large corpora.
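Illustrative lookup; the id is hypothetical and assumes the documents were read from .txt files, whose ids are their file names (see read_source), and that a corpus has already been created:

doc = reader.get_document_by_id("interview_01.txt")
print(doc.metadata["source"], len(doc.text))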

Source code in src/crisp_t/read_data.py
def get_document_by_id(self, doc_id):
    """
    Get a document from the corpus by its ID. Uses parallel search for large corpora.
    """
    if not self._corpus:
        raise ValueError("No corpus found. Please create a corpus first.")
    documents = self._corpus.documents
    if len(documents) < 10:
        for document in tqdm(documents, desc="Searching documents", disable=True):
            if document.id == doc_id:
                return document
    else:
        n_cores = multiprocessing.cpu_count()
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(lambda doc: doc.id == doc_id, document): i
                for i, document in enumerate(documents)
            }
            with tqdm(
                total=len(futures),
                desc=f"Searching documents (parallel, {n_cores} cores)",
            ) as pbar:
                for future in as_completed(futures):
                    i = futures[future]
                    found = future.result()
                    pbar.update(1)
                    if found:
                        return documents[i]
    raise ValueError("Document not found: %s" % doc_id)
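
A small lookup sketch (the folder and document ID are hypothetical). Documents created from a CSV use the stringified row index as their ID, and a missing ID raises ValueError rather than returning None.

from crisp_t.read_data import ReadData

rd = ReadData()
rd.read_corpus_from_json("out")           # hypothetical folder with corpus.json
try:
    doc = rd.get_document_by_id("0")      # IDs are strings
    print(doc.id, doc.text[:80])
except ValueError as err:
    print(err)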

get_documents_from_corpus()

Get the documents from the corpus.

Source code in src/crisp_t/read_data.py
def get_documents_from_corpus(self):
    """
    Get the documents from the corpus.
    """
    if not self._corpus:
        raise ValueError("No corpus found. Please create a corpus first.")
    return self._corpus.documents

pretty_print()

Pretty print the corpus.

Source code in src/crisp_t/read_data.py
def pretty_print(self):
    """
    Pretty print the corpus.
    """
    if not self._corpus:
        self.create_corpus()
    if self._corpus:
        print(
            self._corpus.model_dump_json(indent=4, exclude={"df", "visualization"})
        )
        logger.info(
            "Corpus: %s",
            self._corpus.model_dump_json(indent=4, exclude={"df", "visualization"}),
        )
    else:
        logger.error("No corpus available to pretty print.")

read_corpus_from_json(file_path='', comma_separated_ignore_words='')

Read the corpus from a json file. Parallelizes ignore word removal for large corpora.

Source code in src/crisp_t/read_data.py
def read_corpus_from_json(self, file_path="", comma_separated_ignore_words=""):
    """
    Read the corpus from a json file. Parallelizes ignore word removal for large corpora.
    """
    from pathlib import Path

    file_path = Path(file_path)
    file_name = file_path / "corpus.json"
    df_name = file_path / "corpus_df.csv"
    if self._source:
        file_name = Path(self._source) / file_name
    if not file_name.exists():
        raise ValueError(f"File not found: {file_name}")
    with open(file_name, "r") as f:
        data = json.load(f)
        self._corpus = Corpus.model_validate(data)
        logger.info(f"Corpus read from {file_name}")
    if df_name.exists():
        self._corpus.df = pd.read_csv(df_name)
    else:
        self._corpus.df = None
    # Remove ignore words from self._corpus.documents text
    documents = self._corpus.documents

    # Pre-compile regex patterns once for efficiency instead of inside loops
    compiled_patterns = []
    if comma_separated_ignore_words:
        for word in comma_separated_ignore_words.split(","):
            pattern = re.compile(r"\b" + word.strip() + r"\b", flags=re.IGNORECASE)
            compiled_patterns.append(pattern)

    if len(documents) < 10:
        processed_docs = []
        for document in tqdm(documents, desc="Processing documents", disable=True):
            for pattern in compiled_patterns:
                document.text = pattern.sub("", document.text)
            processed_docs.append(document)
    else:

        def process_doc(document):
            for pattern in compiled_patterns:
                document.text = pattern.sub("", document.text)
            return document

        processed_docs = []
        n_cores = multiprocessing.cpu_count()
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(process_doc, document): document
                for document in documents
            }
            with tqdm(
                total=len(futures),
                desc=f"Processing documents (parallel, {n_cores} cores)",
            ) as pbar:
                for future in as_completed(futures):
                    processed_docs.append(future.result())
                    pbar.update(1)
    self._corpus.documents = processed_docs
    return self._corpus
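
A minimal sketch (the folder and ignore words are hypothetical): corpus.json is loaded from the given folder, corpus_df.csv is attached if present, and the listed words are stripped from every document's text case-insensitively.

from crisp_t.read_data import ReadData

rd = ReadData()
corpus = rd.read_corpus_from_json("out", comma_separated_ignore_words="um,uh")
print(len(corpus.documents))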

read_csv_to_corpus(file_name, comma_separated_ignore_words=None, comma_separated_text_columns='', id_column='')

Read the corpus from a csv file. Parallelizes document creation for large CSVs.

Source code in src/crisp_t/read_data.py
def read_csv_to_corpus(
    self,
    file_name,
    comma_separated_ignore_words=None,
    comma_separated_text_columns="",
    id_column="",
):
    """
    Read the corpus from a csv file. Parallelizes document creation for large CSVs.
    """
    from pathlib import Path

    file_name = Path(file_name)
    if not file_name.exists():
        raise ValueError(f"File not found: {file_name}")
    df = pd.read_csv(file_name)
    original_df = df.copy()
    if comma_separated_text_columns:
        text_columns = comma_separated_text_columns.split(",")
    else:
        text_columns = []
    # remove text columns from the dataframe
    for column in text_columns:
        if column in df.columns:
            df.drop(column, axis=1, inplace=True)
    # Set self._df to the numeric part after dropping text columns
    self._df = df.copy()
    rows = list(original_df.iterrows())

    # Pre-compile regex patterns once for efficiency instead of inside loops
    compiled_patterns = []
    if comma_separated_ignore_words:
        for word in comma_separated_ignore_words.split(","):
            pattern = re.compile(r"\b" + word.strip() + r"\b", flags=re.IGNORECASE)
            compiled_patterns.append(pattern)

    def create_document(args):
        index, row = args
        # Join the text columns, treating None/NaN cells as empty strings
        text_parts = [
            "" if pd.isna(row[column]) else str(row[column])
            for column in text_columns
        ]
        read_from_file = " ".join(text_parts)
        # Apply pre-compiled patterns
        for pattern in compiled_patterns:
            read_from_file = pattern.sub("", read_from_file)
        _document = Document(
            text=read_from_file,
            metadata={
                "source": str(file_name),
                "file_name": str(file_name),
                "row": index,
                "id": (
                    row[id_column]
                    if (id_column != "" and id_column in original_df.columns)
                    else index
                ),
            },
            id=str(index),
            score=0.0,
            name="",
            description="",
        )
        return read_from_file, _document

    if len(rows) < 10:
        results = [
            create_document(args)
            for args in tqdm(rows, desc="Reading CSV rows", disable=True)
        ]
    else:
        results = []
        n_cores = multiprocessing.cpu_count()
        with ThreadPoolExecutor() as executor:
            futures = {
                executor.submit(create_document, args): args for args in rows
            }
            with tqdm(
                total=len(futures),
                desc=f"Reading CSV rows (parallel, {n_cores} cores)",
            ) as pbar:
                for future in as_completed(futures):
                    results.append(future.result())
                    pbar.update(1)

    if len(results) < 10:
        for read_from_file, _document in tqdm(
            results, desc="Finalizing corpus", disable=True
        ):
            self._documents.append(_document)
    else:
        # This loop only appends to the document list; it runs sequentially,
        # so the progress bar does not advertise parallelism.
        with tqdm(
            results,
            total=len(results),
            desc="Finalizing corpus",
        ) as pbar:
            for read_from_file, _document in pbar:
                self._documents.append(_document)
    logger.info(f"Corpus read from {file_name}")
    self.create_corpus()
    return self._corpus
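
A usage sketch with hypothetical file and column names: the listed text columns are concatenated into each Document's text, while the remaining columns stay in the corpus dataframe.

from crisp_t.read_data import ReadData

rd = ReadData()
corpus = rd.read_csv_to_corpus(
    "responses.csv",
    comma_separated_text_columns="feedback,comments",
    comma_separated_ignore_words="n/a",
    id_column="respondent_id",
)
print(corpus.df.columns.tolist())   # text columns have been dropped from the dataframe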

write_corpus_to_json(file_path='', corpus=None)

Write the corpus to a json file.

Accepts either a directory path or an explicit file path ending with 'corpus.json'. In both cases, a sibling 'corpus_df.csv' will be written next to the json if a DataFrame is available.

Source code in src/crisp_t/read_data.py
def write_corpus_to_json(self, file_path="", corpus=None):
    """
    Write the corpus to a json file.

    Accepts either a directory path or an explicit file path ending with
    'corpus.json'. In both cases, a sibling 'corpus_df.csv' will be written
    next to the json if a DataFrame is available.
    """
    from pathlib import Path

    path = Path(file_path)
    # Determine targets
    if path.suffix:  # treat as explicit file path
        file_name = path
        df_name = path.with_name("corpus_df.csv")
    else:
        file_name = path / "corpus.json"
        df_name = path / "corpus_df.csv"

    corp = corpus if corpus is not None else self._corpus
    if not corp:
        raise ValueError("No corpus found. Please create a corpus first.")
    file_name.parent.mkdir(parents=True, exist_ok=True)
    with open(file_name, "w") as f:
        json.dump(corp.model_dump(exclude={"df", "visualization"}), f, indent=4)
    if corp.df is not None and isinstance(corp.df, pd.DataFrame):
        if not corp.df.empty:
            corp.df.to_csv(df_name, index=False)
    logger.info("Corpus written to %s", file_name)
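
A round-trip sketch (paths are hypothetical) showing both accepted forms of the target path:

from crisp_t.read_data import ReadData

rd = ReadData()
rd.read_csv_to_corpus("responses.csv", comma_separated_text_columns="feedback")  # hypothetical CSV
rd.write_corpus_to_json("out")                  # writes out/corpus.json (+ out/corpus_df.csv)
rd.write_corpus_to_json("backup/corpus.json")   # explicit file path; CSV written alongside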

main(verbose, id, name, description, docs, remove_docs, metas, relationships, clear_rel, print_corpus, out, inp, df_cols, df_row_count, df_row, doc_ids, doc_id, print_relationships, relationships_for_keyword, semantic, similar_docs, num, semantic_chunks, rec, metadata_df, metadata_keys, tdabm, graph)

CRISP-T Corpus CLI: create and manipulate a corpus quickly from the command line.

Source code in src/crisp_t/corpuscli.py
@click.command()
@click.option("--verbose", "-v", is_flag=True, help="Print verbose messages.")
@click.option("--id", help="Unique identifier for the corpus.")
@click.option("--name", default=None, help="Name of the corpus.")
@click.option("--description", default=None, help="Description of the corpus.")
@click.option(
    "--doc",
    "docs",
    multiple=True,
    help=(
        "Add a document as 'id|name|text' (or 'id|text' if name omitted). "
        "Can be used multiple times."
    ),
)
@click.option(
    "--remove-doc",
    "remove_docs",
    multiple=True,
    help="Remove a document by its ID (can be used multiple times).",
)
@click.option(
    "--meta",
    "metas",
    multiple=True,
    help="Add or update corpus metadata as key=value (can be used multiple times).",
)
@click.option(
    "--add-rel",
    "relationships",
    multiple=True,
    help=(
        "Add a relationship as 'first|second|relation' (e.g., text:term|numb:col|correlates)."
    ),
)
@click.option(
    "--clear-rel",
    is_flag=True,
    help="Clear all relationships in the corpus metadata.",
)
@click.option("--print", "print_corpus", is_flag=True, help="Pretty print the corpus")
@click.option(
    "--out", default=None, help="Write corpus to a folder or file as corpus.json (save)"
)
@click.option(
    "--inp",
    default=None,
    help="Load corpus from a folder or file containing corpus.json (load)",
)
# New options for Corpus methods
@click.option("--df-cols", is_flag=True, help="Print all DataFrame column names.")
@click.option("--df-row-count", is_flag=True, help="Print number of rows in DataFrame.")
@click.option("--df-row", default=None, type=int, help="Print DataFrame row by index.")
@click.option("--doc-ids", is_flag=True, help="Print all document IDs in the corpus.")
@click.option("--doc-id", default=None, help="Print document by ID.")
@click.option(
    "--relationships",
    "print_relationships",
    is_flag=True,
    help="Print all relationships in the corpus.",
)
@click.option(
    "--relationships-for-keyword",
    default=None,
    help="Print all relationships involving a specific keyword.",
)
@click.option(
    "--semantic",
    default=None,
    help="Perform semantic search with the given query string. Returns similar documents.",
)
@click.option(
    "--similar-docs",
    default=None,
    help="Find documents similar to a comma-separated list of document IDs. Use with --num and --rec. Useful for literature reviews.",
)
@click.option(
    "--num",
    default=5,
    type=int,
    help="Number of results to return (default: 5). Used for semantic search and similar documents search.",
)
@click.option(
    "--semantic-chunks",
    default=None,
    help="Perform semantic search on document chunks. Returns matching chunks for a specific document. Use with --doc-id and --rec (threshold).",
)
@click.option(
    "--rec",
    default=0.4,
    type=float,
    help="Threshold for semantic search (0-1, default: 0.4). Only chunks with similarity above this value are returned.",
)
@click.option(
    "--metadata-df",
    is_flag=True,
    help="Export collection metadata as DataFrame. Requires semantic search to be initialized first.",
)
@click.option(
    "--metadata-keys",
    default=None,
    help="Comma-separated list of metadata keys to include in DataFrame export.",
)
@click.option(
    "--tdabm",
    default=None,
    help="Perform TDABM analysis. Format: 'y_variable:x_variables:radius' (e.g., 'satisfaction:age,income:0.3'). Radius defaults to 0.3 if omitted.",
)
@click.option(
    "--graph",
    is_flag=True,
    help="Generate graph representation of the corpus. Requires documents to have keywords assigned (run with --assign first).",
)
def main(
    verbose: bool,
    id: Optional[str],
    name: Optional[str],
    description: Optional[str],
    docs: tuple[str, ...],
    remove_docs: tuple[str, ...],
    metas: tuple[str, ...],
    relationships: tuple[str, ...],
    clear_rel: bool,
    print_corpus: bool,
    out: Optional[str],
    inp: Optional[str],
    df_cols: bool,
    df_row_count: bool,
    df_row: Optional[int],
    doc_ids: bool,
    doc_id: Optional[str],
    print_relationships: bool,
    relationships_for_keyword: Optional[str],
    semantic: Optional[str],
    similar_docs: Optional[str],
    num: int,
    semantic_chunks: Optional[str],
    rec: float,
    metadata_df: bool,
    metadata_keys: Optional[str],
    tdabm: Optional[str],
    graph: bool,
):
    """
    CRISP-T Corpus CLI: create and manipulate a corpus quickly from the command line.
    """
    logging.basicConfig(level=(logging.DEBUG if verbose else logging.WARNING))
    logger = logging.getLogger(__name__)

    if verbose:
        click.echo("Verbose mode enabled")

    click.echo("_________________________________________")
    click.echo("CRISP-T: Corpus CLI")
    click.echo("_________________________________________")

    # Load corpus from --inp if provided
    corpus = initialize_corpus(inp=inp)
    if not corpus:
        # Build initial corpus from CLI args
        if not id:
            raise click.ClickException("--id is required when not using --inp.")
        corpus = Corpus(
            id=id,
            name=name,
            description=description,
            score=None,
            documents=[],
            df=None,
            visualization={},
            metadata={},
        )

    # Add documents (avoid reusing the name doc_id, which is also the --doc-id option)
    for d in docs:
        new_id, new_name, new_text = _parse_doc(d)
        document = Document(
            id=new_id,
            name=new_name,
            description=None,
            score=0.0,
            text=new_text,
            metadata={},
        )
        corpus.add_document(document)
    if docs:
        click.echo(f"✓ Added {len(docs)} document(s)")

    # Remove documents
    for rid in remove_docs:
        corpus.remove_document_by_id(rid)
    if remove_docs:
        click.echo(f"✓ Removed {len(remove_docs)} document(s)")

    # Update metadata
    for m in metas:
        k, v = _parse_kv(m)
        corpus.update_metadata(k, v)
    if metas:
        click.echo(f"✓ Updated metadata entries: {len(metas)}")

    # Relationships
    for r in relationships:
        first, second, relation = _parse_relationship(r)
        corpus.add_relationship(first, second, relation)
    if relationships:
        click.echo(f"✓ Added {len(relationships)} relationship(s)")
    if clear_rel:
        corpus.clear_relationships()
        click.echo("✓ Cleared relationships")

    # Print DataFrame column names
    if df_cols:
        cols = corpus.get_all_df_column_names()
        click.echo(f"DataFrame columns: {cols}")

    # Print DataFrame row count
    if df_row_count:
        count = corpus.get_row_count()
        click.echo(f"DataFrame row count: {count}")

    # Print DataFrame row by index
    if df_row is not None:
        row = corpus.get_row_by_index(df_row)
        if row is not None:
            click.echo(f"DataFrame row {df_row}: {row.to_dict()}")
        else:
            click.echo(f"No row at index {df_row}")

    # Print all document IDs
    if doc_ids:
        ids = corpus.get_all_document_ids()
        click.echo(f"Document IDs: {ids}")

    # Print document by ID
    if doc_id:
        doc = corpus.get_document_by_id(doc_id)
        if doc:
            click.echo(f"Document {doc_id}: {doc.model_dump()}")
        else:
            click.echo(f"No document found with ID {doc_id}")
            exit(0)

    # Print relationships
    if print_relationships:
        rels = corpus.get_relationships()
        click.echo(f"Relationships: {rels}")

    # Print relationships for keyword
    if relationships_for_keyword:
        rels = corpus.get_all_relationships_for_keyword(relationships_for_keyword)
        click.echo(f"Relationships for keyword '{relationships_for_keyword}': {rels}")

    # Semantic search
    if semantic:
        try:
            from .semantic import Semantic

            click.echo(f"\nPerforming semantic search for: '{semantic}'")
            # Try with default embeddings first, fall back to simple embeddings
            try:
                semantic_analyzer = Semantic(corpus)
            except Exception as network_error:
                # If network error or download fails, try simple embeddings
                if "address" in str(network_error).lower() or "download" in str(network_error).lower():
                    click.echo("Note: Using simple embeddings (network unavailable)")
                    semantic_analyzer = Semantic(corpus, use_simple_embeddings=True)
                else:
                    raise
            corpus = semantic_analyzer.get_similar(semantic, n_results=num)
            click.echo(f"✓ Found {len(corpus.documents)} similar documents")
            click.echo(
                f"Hint: Use --out to save the filtered corpus, or --print to view results"
            )
        except ImportError as e:
            click.echo(f"Error: {e}")
            click.echo("Install chromadb with: pip install chromadb")
        except Exception as e:
            click.echo(f"Error during semantic search: {e}")

    # Find similar documents
    if similar_docs:
        try:
            from .semantic import Semantic

            click.echo(f"\nFinding documents similar to: '{similar_docs}'")
            click.echo(f"Number of results: {num}")
            # Convert rec to 0-1 range if needed (for similar_docs, threshold is 0-1)
            threshold = rec / 10.0 if rec > 1.0 else rec
            click.echo(f"Similarity threshold: {threshold}")

            # Try with default embeddings first, fall back to simple embeddings
            try:
                semantic_analyzer = Semantic(corpus)
            except Exception as network_error:
                # If network error or download fails, try simple embeddings
                if "address" in str(network_error).lower() or "download" in str(network_error).lower():
                    click.echo("Note: Using simple embeddings (network unavailable)")
                    semantic_analyzer = Semantic(corpus, use_simple_embeddings=True)
                else:
                    raise

            # Get similar document IDs
            similar_doc_ids = semantic_analyzer.get_similar_documents(
                document_ids=similar_docs,
                n_results=num,
                threshold=threshold
            )

            click.echo(f"✓ Found {len(similar_doc_ids)} similar documents")
            if similar_doc_ids:
                click.echo("\nSimilar Document IDs:")
                for doc_id in similar_doc_ids:
                    doc = corpus.get_document_by_id(doc_id)
                    doc_name = f" ({doc.name})" if doc and doc.name else ""
                    click.echo(f"  - {doc_id}{doc_name}")
                click.echo("\nHint: Use --doc-id to view individual documents")
                click.echo("Hint: This feature is useful for literature reviews to find similar documents")
            else:
                click.echo("No similar documents found above the threshold.")
                click.echo("Hint: Try lowering the threshold with --rec")

        except ImportError as e:
            click.echo(f"Error: {e}")
            click.echo("Install chromadb with: pip install chromadb")
        except Exception as e:
            click.echo(f"Error finding similar documents: {e}")


    # Semantic chunk search
    if semantic_chunks:
        if not doc_id:
            click.echo("Error: --doc-id is required when using --semantic-chunks")
        else:
            try:
                from .semantic import Semantic

                click.echo(f"\nPerforming semantic chunk search for: '{semantic_chunks}'")
                click.echo(f"Document ID: {doc_id}")
                click.echo(f"Threshold: {rec}")

                # Try with default embeddings first, fall back to simple embeddings
                try:
                    semantic_analyzer = Semantic(corpus)
                except Exception as network_error:
                    # If network error or download fails, try simple embeddings
                    if "address" in str(network_error).lower() or "download" in str(network_error).lower():
                        click.echo("Note: Using simple embeddings (network unavailable)")
                        semantic_analyzer = Semantic(corpus, use_simple_embeddings=True)
                    else:
                        raise

                # Get similar chunks
                chunks = semantic_analyzer.get_similar_chunks(
                    query=semantic_chunks,
                    doc_id=doc_id,
                    threshold=rec,
                    n_results=20  # Get more chunks to filter by threshold
                )

                click.echo(f"✓ Found {len(chunks)} matching chunks")
                click.echo("\nMatching chunks:")
                click.echo("=" * 60)
                for i, chunk in enumerate(chunks, 1):
                    click.echo(f"\nChunk {i}:")
                    click.echo(chunk)
                    click.echo("-" * 60)

                if len(chunks) == 0:
                    click.echo("No chunks matched the query above the threshold.")
                    click.echo("Hint: Try lowering the threshold with --rec or use a different query.")
                else:
                    click.echo(f"\nHint: These {len(chunks)} chunks can be used for coding/annotating the document.")
                    click.echo("Hint: Adjust --rec threshold to get more or fewer results.")

            except ImportError as e:
                click.echo(f"Error: {e}")
                click.echo("Install chromadb with: pip install chromadb")
            except Exception as e:
                click.echo(f"Error during semantic chunk search: {e}")

    # Export metadata as DataFrame
    if metadata_df:
        try:
            from .semantic import Semantic

            click.echo("\nExporting metadata as DataFrame...")
            # Try with default embeddings first, fall back to simple embeddings
            try:
                semantic_analyzer = Semantic(corpus)
            except Exception as network_error:
                # If network error or download fails, try simple embeddings
                if "address" in str(network_error).lower() or "download" in str(network_error).lower():
                    click.echo("Note: Using simple embeddings (network unavailable)")
                    semantic_analyzer = Semantic(corpus, use_simple_embeddings=True)
                else:
                    raise
            # Parse metadata_keys if provided
            keys_list = None
            if metadata_keys:
                keys_list = [k.strip() for k in metadata_keys.split(",")]
            corpus = semantic_analyzer.get_df(metadata_keys=keys_list)
            click.echo("✓ Metadata exported to DataFrame")
            if corpus.df is not None:
                click.echo(f"DataFrame shape: {corpus.df.shape}")
                click.echo(f"Columns: {list(corpus.df.columns)}")
            click.echo("Hint: Use --out to save the corpus with the updated DataFrame")
        except ImportError as e:
            click.echo(f"Error: {e}")
            click.echo("Install chromadb with: pip install chromadb")
        except Exception as e:
            click.echo(f"Error exporting metadata: {e}")

    # TDABM analysis
    if tdabm:
        try:
            # Parse tdabm parameter: y_variable:x_variables:radius
            parts = tdabm.split(":")
            if len(parts) < 2:
                raise click.ClickException(
                    "Invalid --tdabm format. Use 'y_variable:x_variables:radius' "
                    "(e.g., 'satisfaction:age,income:0.3'). Radius defaults to 0.3 if omitted."
                )

            y_var = parts[0].strip()
            x_vars = parts[1].strip()
            radius = 0.3  # default

            if len(parts) >= 3:
                try:
                    radius = float(parts[2].strip())
                except ValueError:
                    raise click.ClickException(f"Invalid radius value: '{parts[2]}'. Must be a number.")

            click.echo(f"\nPerforming TDABM analysis...")
            click.echo(f"  Y variable: {y_var}")
            click.echo(f"  X variables: {x_vars}")
            click.echo(f"  Radius: {radius}")

            tdabm_analyzer = Tdabm(corpus)
            result = tdabm_analyzer.generate_tdabm(y=y_var, x_variables=x_vars, radius=radius)

            click.echo("\n" + result)
            click.echo("\nHint: TDABM results stored in corpus metadata['tdabm']")
            click.echo("Hint: Use --out to save the corpus with TDABM metadata")
            click.echo("Hint: Use 'crispviz --tdabm' to visualize the results")

        except ValueError as e:
            click.echo(f"Error: {e}")
            click.echo("Hint: Ensure your corpus has a DataFrame with the specified variables")
            click.echo("Hint: Y variable must be continuous (not binary)")
            click.echo("Hint: X variables must be numeric/ordinal")
        except Exception as e:
            click.echo(f"Error during TDABM analysis: {e}")

    # Graph generation
    if graph:
        try:
            from .graph import CrispGraph

            click.echo("\nGenerating graph representation...")
            graph_gen = CrispGraph(corpus)
            graph_data = graph_gen.create_graph()

            click.echo(f"✓ Graph created successfully")
            click.echo(f"  Nodes: {graph_data['num_nodes']}")
            click.echo(f"  Edges: {graph_data['num_edges']}")
            click.echo(f"  Documents: {graph_data['num_documents']}")
            click.echo(f"  Has keywords: {graph_data['has_keywords']}")
            click.echo(f"  Has clusters: {graph_data['has_clusters']}")
            click.echo(f"  Has metadata: {graph_data['has_metadata']}")

            click.echo("\nHint: Graph data stored in corpus metadata['graph']")
            click.echo("Hint: Use --out to save the corpus with graph metadata")
            click.echo("Hint: Use 'crispviz --graph' to visualize the graph")

        except ValueError as e:
            click.echo(f"Error: {e}")
            click.echo("Hint: Make sure documents have keywords assigned first")
            click.echo("Hint: You can assign keywords using text analysis features")
        except Exception as e:
            click.echo(f"Error generating graph: {e}")
            logger.error(f"Graph generation error: {e}", exc_info=True)

    # Save corpus to --out if provided
    if out:
        from .read_data import ReadData

        rd = ReadData(corpus=corpus)
        rd.write_corpus_to_json(out, corpus=corpus)
        click.echo(f"✓ Corpus saved to {out}")

    if print_corpus:
        click.echo("\n=== Corpus Details ===")
        corpus.pretty_print()

    logger.info("Corpus CLI finished")
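
Because main is a standard click command, it can be exercised programmatically with click's test runner; the corpus ID and document payload below are made up.

from click.testing import CliRunner
from crisp_t.corpuscli import main

runner = CliRunner()
result = runner.invoke(
    main,
    ["--id", "demo", "--doc", "d1|First doc|Hello world", "--print"],
)
print(result.output)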

main(verbose, inp, out, bins, topics_num, top_n, corr_columns, freq, by_topic, wordcloud, ldavis, top_terms, corr_heatmap, tdabm, graph, graph_nodes, graph_layout)

CRISP-T: Visualization CLI

Load a corpus from a folder containing corpus.json (via --inp) and export the selected visualizations as PNG files into the output directory.

Source code in src/crisp_t/vizcli.py
@click.command()
@click.option("--verbose", "-v", is_flag=True, help="Print verbose messages.")
@click.option("--inp", "-i", help="Load corpus from a folder containing corpus.json")
@click.option(
    "--out",
    "-o",
    help="Output directory where PNG images will be written",
)
@click.option(
    "--bins", default=100, show_default=True, help="Number of bins for distributions"
)
@click.option(
    "--topics-num",
    default=8,
    show_default=True,
    help="Number of topics for LDA when required (default 8 as per Mettler et al. 2025)",
)
@click.option(
    "--top-n",
    default=20,
    show_default=True,
    help="Top N terms to show in top-terms chart",
)
@click.option(
    "--corr-columns",
    default="",
    help="Comma separated numeric columns for correlation heatmap; if empty, auto-select",
)
@click.option("--freq", is_flag=True, help="Export: word frequency distribution")
@click.option(
    "--by-topic",
    is_flag=True,
    help="Export: distribution by dominant topic (requires LDA)",
)
@click.option(
    "--wordcloud", is_flag=True, help="Export: topic wordcloud (requires LDA)"
)
@click.option(
    "--ldavis",
    is_flag=True,
    help="Export: interactive LDA visualization HTML (requires LDA)",
)
@click.option(
    "--top-terms", is_flag=True, help="Export: top terms bar chart (computed from text)"
)
@click.option(
    "--corr-heatmap",
    is_flag=True,
    help="Export: correlation heatmap (from CSV numeric columns)",
)
@click.option(
    "--tdabm",
    is_flag=True,
    help="Export: TDABM visualization (requires TDABM analysis in corpus metadata)",
)
@click.option(
    "--graph",
    is_flag=True,
    help="Export: graph visualization (requires graph data in corpus metadata)",
)
@click.option(
    "--graph-nodes",
    default="",
    help=(
        "Comma separated node types to include for graph: document,keyword,cluster,metadata. "
        "Example: --graph-nodes document,keyword. If empty or 'all', include all."
    ),
)
@click.option(
    "--graph-layout",
    default="spring",
    show_default=True,
    help="Layout algorithm for graph visualization: spring, circular, kamada_kawai, or spectral",
)
def main(
    verbose: bool,
    inp: Optional[str],
    out: str,
    bins: int,
    topics_num: int,
    top_n: int,
    corr_columns: str,
    freq: bool,
    by_topic: bool,
    wordcloud: bool,
    ldavis: bool,
    top_terms: bool,
    corr_heatmap: bool,
    tdabm: bool,
    graph: bool,
    graph_nodes: str,
    graph_layout: str,
):
    """CRISP-T: Visualization CLI

    Build corpus (source preferred over inp), optionally handle multiple sources,
    and export selected visualizations as PNG files into the output directory.
    """

    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)
        click.echo("Verbose mode enabled")

    click.echo("_________________________________________")
    click.echo("CRISP-T: Visualizations")
    click.echo(f"Version: {__version__}")
    click.echo("_________________________________________")

    try:
        out_dir = Path(out)
    except TypeError:
        click.echo(
            f"No output directory specified. Visualizations need an output folder."
        )
        raise click.Abort()
    out_dir.mkdir(parents=True, exist_ok=True)

    # Initialize the corpus from --inp
    corpus = initialize_corpus(inp=inp)

    if not corpus:
        raise click.ClickException("No input provided. Use --inp to load a corpus.")

    viz = QRVisualize(corpus=corpus)

    # Helper: build LDA if by-topic or wordcloud requested
    cluster_instance = None

    def ensure_topics():
        nonlocal cluster_instance
        if cluster_instance is None:
            cluster_instance = Cluster(corpus=corpus)
            cluster_instance.build_lda_model(topics=topics_num)
            # Populate visualization structures used by QRVisualize
            cluster_instance.format_topics_sentences(visualize=True)
        return cluster_instance

    # 1) Word frequency distribution
    if freq:
        df_text = pd.DataFrame(
            {"Text": [getattr(doc, "text", "") or "" for doc in corpus.documents]}
        )
        out_path = out_dir / "word_frequency.png"
        viz.plot_frequency_distribution_of_words(
            df=df_text, folder_path=str(out_path), bins=bins, show=False
        )
        click.echo(f"Saved: {out_path}")

    # 2) Distribution by topic (requires topics)
    if by_topic:
        ensure_topics()
        out_path = out_dir / "by_topic.png"
        viz.plot_distribution_by_topic(
            df=None, folder_path=str(out_path), bins=bins, show=False
        )
        click.echo(f"Saved: {out_path}")

    # 3) Topic wordcloud (requires topics)
    if wordcloud:
        ensure_topics()
        out_path = out_dir / "wordcloud.png"
        viz.plot_wordcloud(topics=None, folder_path=str(out_path), show=False)
        click.echo(f"Saved: {out_path}")

    # 3.5) LDA visualization (requires topics)
    if ldavis:
        cluster = ensure_topics()
        out_path = out_dir / "lda_visualization.html"
        try:
            viz.get_lda_viz(
                lda_model=cluster._lda_model,
                corpus_bow=cluster._bag_of_words,
                dictionary=cluster._dictionary,
                folder_path=str(out_path),
                show=False,
            )
            click.echo(f"Saved: {out_path}")
        except ImportError as e:
            click.echo(f"Warning: {e}")
        except Exception as e:
            click.echo(f"Error generating LDA visualization: {e}")

    # 4) Top terms (compute from text directly)
    if top_terms:
        texts = [getattr(doc, "text", "") or "" for doc in corpus.documents]
        tokens = []
        for t in texts:
            tokens.extend((t or "").lower().split())
        freq_map = Counter(tokens)
        if not freq_map:
            click.echo("No tokens found to plot top terms.")
        else:
            df_terms = pd.DataFrame(
                {
                    "term": list(freq_map.keys()),
                    "frequency": list(freq_map.values()),
                }
            )
            # QRVisualize sorts internally; we just pass full DF
            out_path = out_dir / "top_terms.png"
            viz.plot_top_terms(
                df=df_terms, top_n=top_n, folder_path=str(out_path), show=False
            )
            click.echo(f"Saved: {out_path}")

    # 5) Correlation heatmap
    if corr_heatmap:
        if getattr(corpus, "df", None) is None or corpus.df.empty:
            click.echo("No CSV data available for correlation heatmap; skipping.")
        else:
            df0 = corpus.df.copy()
            # If user specified columns, attempt to use them; else let visualize auto-select
            cols = (
                [c.strip() for c in corr_columns.split(",") if c.strip()]
                if corr_columns
                else None
            )
            out_path = out_dir / "corr_heatmap.png"
            if cols:
                # Pass subset to avoid rename ambiguity
                sub = (
                    df0[cols].copy().select_dtypes(include=["number"])
                )  # ensure numeric
                viz.plot_correlation_heatmap(
                    df=sub, columns=None, folder_path=str(out_path), show=False
                )
            else:
                viz.plot_correlation_heatmap(
                    df=df0, columns=None, folder_path=str(out_path), show=False
                )
            click.echo(f"Saved: {out_path}")

    # TDABM visualization
    if tdabm:
        if "tdabm" not in corpus.metadata:
            click.echo("Warning: No TDABM data found in corpus metadata.")
            click.echo(
                "Hint: Run TDABM analysis first with: crispt --tdabm y_var:x_vars:radius --inp <corpus_dir>"
            )
        else:
            out_path = out_dir / "tdabm.png"
            try:
                viz.draw_tdabm(corpus=corpus, folder_path=str(out_path), show=False)
                click.echo(f"Saved: {out_path}")
            except Exception as e:
                click.echo(f"Error generating TDABM visualization: {e}")
                logger.error(f"TDABM visualization error: {e}", exc_info=True)

    # Graph visualization (filtered by node types if provided)
    if graph or graph_nodes:
        if "graph" not in corpus.metadata:
            click.echo("Warning: No graph data found in corpus metadata.")
            click.echo(
                "Hint: Run graph generation first with: crispt --graph --inp <corpus_dir>"
            )
        else:
            raw_types = (graph_nodes or "").strip().lower()
            include_all = raw_types in ("", "all", "*")
            allowed_types = {"document", "keyword", "cluster", "metadata"}
            requested_types = set()
            if not include_all:
                for part in raw_types.split(","):
                    p = part.strip()
                    if not p:
                        continue
                    if p in allowed_types:
                        requested_types.add(p)
                    else:
                        click.echo(
                            f"Warning: Unknown node type '{p}' ignored. Allowed: {', '.join(sorted(allowed_types))}"
                        )
                if not requested_types:
                    click.echo("No valid node types specified; defaulting to all.")
                    include_all = True

            graph_data = corpus.metadata.get("graph", {})
            nodes = graph_data.get("nodes", [])
            edges = graph_data.get("edges", [])

            if include_all:
                filtered_nodes = nodes
                filtered_edges = edges
            else:
                filtered_nodes = [n for n in nodes if n.get("label") in requested_types]
                kept_ids = {str(n.get("id")) for n in filtered_nodes}
                filtered_edges = [
                    e
                    for e in edges
                    if str(e.get("source")) in kept_ids
                    and str(e.get("target")) in kept_ids
                ]

            # Build a shallow copy of graph metadata with filtered components
            filtered_graph_meta = dict(graph_data)
            filtered_graph_meta["nodes"] = filtered_nodes
            filtered_graph_meta["edges"] = filtered_edges
            filtered_graph_meta["num_nodes"] = len(filtered_nodes)
            filtered_graph_meta["num_edges"] = len(filtered_edges)
            filtered_graph_meta["num_documents"] = sum(
                1 for n in filtered_nodes if n.get("label") == "document"
            )

            # Inject temporary filtered metadata for visualization
            original_graph_meta = corpus.metadata.get("graph")
            corpus.metadata["graph"] = filtered_graph_meta
            out_path = out_dir / "graph.png"
            try:
                viz.draw_graph(
                    corpus=corpus,
                    folder_path=str(out_path),
                    show=False,
                    layout=graph_layout,
                )
                click.echo(f"Saved: {out_path}")
                if not include_all:
                    click.echo(
                        f"Graph filtered to node types: {', '.join(sorted(requested_types))}"
                    )
            except Exception as e:
                click.echo(f"Error generating graph visualization: {e}")
                logger.error(f"Graph visualization error: {e}", exc_info=True)
            finally:
                # Restore original metadata (avoid side-effects)
                corpus.metadata["graph"] = original_graph_meta

    click.echo("\n=== Visualization Complete ===")

Copyright (C) 2025 Bell Eapen

This file is part of crisp-t.

crisp-t is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

crisp-t is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with crisp-t. If not, see https://www.gnu.org/licenses/.

Corpus

Bases: BaseModel

Corpus model for storing a collection of documents.
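
A minimal construction sketch (field values are illustrative, and the Document import path is assumed), mirroring how the Corpus CLI above builds an empty corpus and adds a document to it.

from crisp_t.model.corpus import Corpus
from crisp_t.model.document import Document  # assumed module path

corpus = Corpus(
    id="corpus_demo",
    name="Demo",
    description="Illustrative corpus",
    score=None,
    documents=[],
    df=None,
    visualization={},
    metadata={},
)
corpus.add_document(
    Document(id="d1", name="First", description=None, score=0.0, text="Hello", metadata={})
)
corpus.pretty_print(show="documents 1")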

Source code in src/crisp_t/model/corpus.py
class Corpus(BaseModel):
    """
    Corpus model for storing a collection of documents.
    """

    id: str = Field(..., description="Unique identifier for the corpus.")
    name: Optional[str] = Field(None, description="Name of the corpus.")
    description: Optional[str] = Field(None, description="Description of the corpus.")
    score: Optional[float] = Field(
        None, description="Score associated with the corpus."
    )
    documents: list[Document] = Field(
        default_factory=list, description="List of documents in the corpus."
    )
    df: Optional[pd.DataFrame] = Field(
        None, description="Numeric data associated with the corpus."
    )
    visualization: Dict[str, Any] = Field(
        default_factory=dict, description="Visualization data associated with the corpus."
    )
    model_config = ConfigDict(
        arbitrary_types_allowed=True
    )  # required for pandas DataFrame
    metadata: dict = Field(
        default_factory=dict, description="Metadata associated with the corpus."
    )

    def pretty_print(self, show="all"):
        """
        Print the corpus information in a human-readable format.

        Args:
            show: Display option. Can be:
                - "all": Show all corpus information
                - "documents": Show first 5 documents
                - "documents N": Show first N documents (e.g., "documents 10")
                - "documents metadata": Show document-specific metadata
                - "dataframe": Show DataFrame head
                - "dataframe metadata": Show DataFrame metadata columns (metadata_*)
                - "dataframe stats": Show DataFrame statistics
                - "metadata": Show all corpus metadata
                - "metadata KEY": Show specific metadata key (e.g., "metadata pca")
                - "stats": Show DataFrame statistics (deprecated, use "dataframe stats")
        """
        # Color codes for terminal output
        BLUE = "\033[94m"
        GREEN = "\033[92m"
        YELLOW = "\033[93m"
        CYAN = "\033[96m"
        MAGENTA = "\033[95m"
        RED = "\033[91m"
        RESET = "\033[0m"
        BOLD = "\033[1m"

        # Parse the show parameter to support subcommands
        parts = show.split(maxsplit=1)
        main_command = parts[0]
        sub_command = parts[1] if len(parts) > 1 else None

        # Print basic corpus info for most commands
        if main_command in ["all", "documents", "dataframe", "metadata"]:
            print(f"{BOLD}{BLUE}Corpus ID:{RESET} {self.id}")
            print(f"{BOLD}{BLUE}Name:{RESET} {self.name}")
            print(f"{BOLD}{BLUE}Description:{RESET} {self.description}")

        # Handle documents command
        if main_command in ["all", "documents"]:
            if sub_command == "metadata":
                # Show document-specific metadata
                print(f"\n{BOLD}{GREEN}=== Document Metadata ==={RESET}")
                if not self.documents:
                    print("No documents in corpus")
                else:
                    for i, doc in enumerate(self.documents, 1):
                        print(f"\n{CYAN}Document {i}:{RESET}")
                        print(f"  {BOLD}ID:{RESET} {doc.id}")
                        print(f"  {BOLD}Name:{RESET} {doc.name}")
                        if doc.metadata:
                            print(f"  {BOLD}Metadata:{RESET}")
                            for key, value in doc.metadata.items():
                                # Truncate long values
                                val_str = str(value)
                                if len(val_str) > 100:
                                    val_str = val_str[:97] + "..."
                                print(f"    {YELLOW}{key}:{RESET} {val_str}")
                        else:
                            print(f"  {BOLD}Metadata:{RESET} (none)")
            else:
                # Determine how many documents to show
                num_docs = 5  # default
                if sub_command:
                    try:
                        num_docs = int(sub_command)
                    except ValueError:
                        print(f"{RED}Invalid number for documents: {sub_command}. Using default (5).{RESET}")

                print(f"\n{BOLD}{GREEN}=== Documents ==={RESET}")
                print(f"Total documents: {len(self.documents)}")
                print(f"Showing first {min(num_docs, len(self.documents))} document(s):\n")

                for i, doc in enumerate(self.documents[:num_docs], 1):
                    print(f"{CYAN}Document {i}:{RESET}")
                    print(f"  {BOLD}Name:{RESET} {doc.name}")
                    print(f"  {BOLD}ID:{RESET} {doc.id}")
                    # Show a snippet of text if available
                    if hasattr(doc, 'text') and doc.text:
                        text_snippet = doc.text[:200] + "..." if len(doc.text) > 200 else doc.text
                        print(f"  {BOLD}Text:{RESET} {text_snippet}")
                    print()

        # Handle dataframe command
        if main_command in ["all", "dataframe"]:
            if self.df is not None:
                if sub_command == "metadata":
                    # Show DataFrame metadata columns (columns starting with metadata_)
                    print(f"\n{BOLD}{GREEN}=== DataFrame Metadata Columns ==={RESET}")
                    metadata_cols = [col for col in self.df.columns if col.startswith("metadata_")]
                    if metadata_cols:
                        print(f"Found {len(metadata_cols)} metadata column(s):")
                        for col in metadata_cols:
                            print(f"  {YELLOW}{col}{RESET}")
                            # Show some statistics for the metadata column
                            print(f"    Non-null values: {self.df[col].notna().sum()}")
                            print(f"    Null values: {self.df[col].isna().sum()}")
                            # Show unique values if not too many
                            unique_count = self.df[col].nunique()
                            if unique_count <= 10:
                                print(f"    Unique values ({unique_count}): {list(self.df[col].unique())}")
                            else:
                                print(f"    Unique values: {unique_count}")
                    else:
                        print("No metadata columns found (columns starting with 'metadata_')")
                elif sub_command == "stats":
                    # Show DataFrame statistics
                    self._print_dataframe_stats()
                else:
                    # Show DataFrame head
                    print(f"\n{BOLD}{GREEN}=== DataFrame ==={RESET}")
                    print(f"Shape: {self.df.shape}")
                    print(f"Columns: {list(self.df.columns)}")
                    print("\nFirst few rows:")
                    print(self.df.head())
            else:
                if main_command == "dataframe":
                    print(f"\n{BOLD}{RED}No DataFrame available{RESET}")

        # Handle metadata command
        if main_command in ["all", "metadata"]:
            if sub_command:
                # Show specific metadata key
                print(f"\n{BOLD}{GREEN}=== Metadata: {sub_command} ==={RESET}")
                if sub_command in self.metadata:
                    value = self.metadata[sub_command]
                    # Format the output based on the type of value
                    if isinstance(value, dict):
                        for k, v in value.items():
                            print(f"{YELLOW}{k}:{RESET} {v}")
                    elif isinstance(value, list):
                        for i, item in enumerate(value, 1):
                            print(f"{i}. {item}")
                    else:
                        print(value)
                else:
                    print(f"{RED}Metadata key '{sub_command}' not found{RESET}")
                    available_keys = list(self.metadata.keys())
                    if available_keys:
                        print(f"Available keys: {', '.join(available_keys)}")
            else:
                # Show all metadata
                print(f"\n{BOLD}{GREEN}=== Corpus Metadata ==={RESET}")
                if not self.metadata:
                    print("No metadata available")
                else:
                    for key, value in self.metadata.items():
                        print(f"\n{MAGENTA}{key}:{RESET}")
                        # Truncate long values
                        val_str = str(value)
                        if len(val_str) > 500:
                            val_str = val_str[:497] + "..."
                        print(f"  {val_str}")

        # Handle stats command (deprecated, redirect to dataframe stats)
        if main_command == "stats":
            print(f"{YELLOW}Note: 'stats' is deprecated. Use 'dataframe stats' instead.{RESET}")
            if self.df is not None:
                self._print_dataframe_stats()
            else:
                print(f"{RED}No DataFrame available{RESET}")

        print(f"\n{BOLD}Display completed for '{show}'{RESET}")

    def _print_dataframe_stats(self):
        """Helper method to print DataFrame statistics."""
        YELLOW = "\033[93m"
        BOLD = "\033[1m"
        RESET = "\033[0m"
        GREEN = "\033[92m"

        print(f"\n{BOLD}{GREEN}=== DataFrame Statistics ==={RESET}")
        print(self.df.describe())
        print(f"\n{BOLD}Distinct values per column:{RESET}")
        for col in self.df.columns:
            nunique = self.df[col].nunique()
            print(f"  {YELLOW}{col}:{RESET} {nunique} distinct value(s)")
            # If distinct values < 10, show value counts
            if nunique <= 10:
                print(f"    Value counts:")
                for val, count in self.df[col].value_counts().items():
                    print(f"      {val}: {count}")
                print()

    def get_all_df_column_names(self):
        """
        Get a list of all column names in the DataFrame.

        Returns:
            List of column names.
        """
        if self.df is not None:
            return self.df.columns.tolist()
        return []

    def get_descriptive_statistics(self):
        """
        Get descriptive statistics of the DataFrame.

        Returns:
            DataFrame containing descriptive statistics, or None if DataFrame is None.
        """
        if self.df is not None:
            return self.df.describe()
        return None

    def get_row_count(self):
        """
        Get the number of rows in the DataFrame.

        Returns:
            Number of rows in the DataFrame, or 0 if DataFrame is None.
        """
        if self.df is not None:
            return len(self.df)
        return 0

    def get_row_by_index(self, index: int) -> Optional[pd.Series]:
        """
        Get a row from the DataFrame by its index.

        Args:
            index: Index of the row to retrieve.
        Returns:
            Row as a pandas Series if index is valid, else None.
        """
        if self.df is not None and 0 <= index < len(self.df):
            return self.df.iloc[index]
        return None

    def get_all_document_ids(self):
        """
        Get a list of all document IDs in the corpus.

        Returns:
            List of document IDs.
        """
        return [doc.id for doc in self.documents]

    def get_document_by_id(self, document_id: str) -> Optional[Document]:
        """
        Get a document by its ID.

        Args:
            document_id: ID of the document to retrieve.

        Returns:
            Document object if found, else None.
        """
        for doc in self.documents:
            if doc.id == document_id:
                return doc
        return None

    def add_document(self, document: Document):
        """
        Add a document to the corpus.

        Args:
            document: Document object to add.
        """
        self.documents.append(document)

    def remove_document_by_id(self, document_id: str):
        """
        Remove a document from the corpus by its ID.

        Args:
            document_id: ID of the document to remove.
        """
        self.documents = [
            doc for doc in self.documents if doc.id != document_id
        ]

    def update_metadata(self, key: str, value: Any):
        """
        Update the metadata of the corpus.

        Args:
            key: Metadata key to update.
            value: New value for the metadata key.
        """
        self.metadata[key] = value

    def add_relationship(self, first: str, second: str, relation: str):
        """
        Add a relationship between two documents in the corpus.

        Args:
            first: keywords from text documents in the format text:keyword or columns from dataframe in the format numb:column
            second: keywords from text documents in the format text:keyword or columns from dataframe in the format numb:column
            relation: Description of the relationship. (One of "correlates", "similar to", "cites", "references", "contradicts", etc.)
        """
        if "relationships" not in self.metadata:
            self.metadata["relationships"] = []
        self.metadata["relationships"].append(
            {"first": first, "second": second, "relation": relation}
        )

    def clear_relationships(self):
        """
        Clear all relationships in the corpus metadata.
        """
        if "relationships" in self.metadata:
            self.metadata["relationships"] = []

    def get_relationships(self):
        """
        Get all relationships in the corpus metadata.

        Returns:
            List of relationships, or empty list if none exist.
        """
        return self.metadata.get("relationships", [])

    def get_all_relationships_for_keyword(self, keyword: str):
        """
        Get all relationships involving a specific keyword.

        Args:
            keyword: Keyword to search for in relationships.

        Returns:
            List of relationships involving the keyword.
        """
        rels = self.get_relationships()
        return [
            rel
            for rel in rels
            if keyword in rel["first"] or keyword in rel["second"]
        ]

add_document(document)

Add a document to the corpus.

Parameters:

document (Document): Document object to add. Required.
Source code in src/crisp_t/model/corpus.py, lines 306-313
def add_document(self, document: Document):
    """
    Add a document to the corpus.

    Args:
        document: Document object to add.
    """
    self.documents.append(document)

add_relationship(first, second, relation)

Add a relationship between two documents in the corpus.

Parameters:

first (str): Keywords from text documents in the format text:keyword, or columns from the DataFrame in the format numb:column. Required.
second (str): Keywords from text documents in the format text:keyword, or columns from the DataFrame in the format numb:column. Required.
relation (str): Description of the relationship (one of "correlates", "similar to", "cites", "references", "contradicts", etc.). Required.
Source code in src/crisp_t/model/corpus.py, lines 336-349
def add_relationship(self, first: str, second: str, relation: str):
    """
    Add a relationship between two documents in the corpus.

    Args:
        first: keywords from text documents in the format text:keyword or columns from dataframe in the format numb:column
        second: keywords from text documents in the format text:keyword or columns from dataframe in the format numb:column
        relation: Description of the relationship. (One of "correlates", "similar to", "cites", "references", "contradicts", etc.)
    """
    if "relationships" not in self.metadata:
        self.metadata["relationships"] = []
    self.metadata["relationships"].append(
        {"first": first, "second": second, "relation": relation}
    )
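
Example (not from the source): a minimal sketch of how the relationship helpers can be combined. The keyword, column names, and the import path (inferred from the source location src/crisp_t/model/corpus.py) are assumptions for illustration only.

from crisp_t.model.corpus import Corpus  # import path inferred from the source location above


def link_theme_to_column(corpus: Corpus) -> None:
    # Record links between a text keyword and another keyword or a numeric column
    corpus.add_relationship("text:fatigue", "numb:age", "correlates")
    corpus.add_relationship("text:fatigue", "text:burnout", "similar to")

    # Inspect everything stored under metadata["relationships"]
    for rel in corpus.get_relationships():
        print(rel["first"], rel["relation"], rel["second"])

    # Filter by keyword, then reset if needed
    print(corpus.get_all_relationships_for_keyword("fatigue"))
    corpus.clear_relationships()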

clear_relationships()

Clear all relationships in the corpus metadata.

Source code in src/crisp_t/model/corpus.py, lines 351-356
def clear_relationships(self):
    """
    Clear all relationships in the corpus metadata.
    """
    if "relationships" in self.metadata:
        self.metadata["relationships"] = []

get_all_df_column_names()

Get a list of all column names in the DataFrame.

Returns: List of column names.

Source code in src/crisp_t/model/corpus.py, lines 236-245
def get_all_df_column_names(self):
    """
    Get a list of all column names in the DataFrame.

    Returns:
        List of column names.
    """
    if self.df is not None:
        return self.df.columns.tolist()
    return []

get_all_document_ids()

Get a list of all document IDs in the corpus.

Returns: List of document IDs.

Source code in src/crisp_t/model/corpus.py, lines 282-289
def get_all_document_ids(self):
    """
    Get a list of all document IDs in the corpus.

    Returns:
        List of document IDs.
    """
    return [doc.id for doc in self.documents]

get_all_relationships_for_keyword(keyword)

Get all relationships involving a specific keyword.

Parameters:

keyword (str): Keyword to search for in relationships. Required.

Returns: List of relationships involving the keyword.

Source code in src/crisp_t/model/corpus.py, lines 367-382
def get_all_relationships_for_keyword(self, keyword: str):
    """
    Get all relationships involving a specific keyword.

    Args:
        keyword: Keyword to search for in relationships.

    Returns:
        List of relationships involving the keyword.
    """
    rels = self.get_relationships()
    return [
        rel
        for rel in rels
        if keyword in rel["first"] or keyword in rel["second"]
    ]

get_descriptive_statistics()

Get descriptive statistics of the DataFrame.

Returns: DataFrame containing descriptive statistics, or None if the DataFrame is None.

Source code in src/crisp_t/model/corpus.py, lines 247-256
def get_descriptive_statistics(self):
    """
    Get descriptive statistics of the DataFrame.

    Returns:
        DataFrame containing descriptive statistics, or None if DataFrame is None.
    """
    if self.df is not None:
        return self.df.describe()
    return None

get_document_by_id(document_id)

Get a document by its ID.

Parameters:

document_id (str): ID of the document to retrieve. Required.

Returns: Optional[Document] - Document object if found, else None.

Source code in src/crisp_t/model/corpus.py, lines 291-304
def get_document_by_id(self, document_id: str) -> Optional[Document]:
    """
    Get a document by its ID.

    Args:
        document_id: ID of the document to retrieve.

    Returns:
        Document object if found, else None.
    """
    for doc in self.documents:
        if doc.id == document_id:
            return doc
    return None
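
Example (not from the source): a short sketch of the document accessors, assuming corpus already holds documents; the import paths are inferred from the source locations shown on this page.

from typing import Optional

from crisp_t.model.corpus import Corpus
from crisp_t.model.document import Document


def first_document(corpus: Corpus) -> Optional[Document]:
    # List all IDs, then fetch one document back by its ID
    ids = corpus.get_all_document_ids()
    return corpus.get_document_by_id(ids[0]) if ids else None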

get_relationships()

Get all relationships in the corpus metadata.

Returns: List of relationships, or empty list if none exist.

Source code in src/crisp_t/model/corpus.py, lines 358-365
def get_relationships(self):
    """
    Get all relationships in the corpus metadata.

    Returns:
        List of relationships, or empty list if none exist.
    """
    return self.metadata.get("relationships", [])

get_row_by_index(index)

Get a row from the DataFrame by its index.

Parameters:

index (int): Index of the row to retrieve. Required.

Returns: Row as a pandas Series if index is valid, else None.

Source code in src/crisp_t/model/corpus.py, lines 269-280
def get_row_by_index(self, index: int) -> Optional[pd.Series]:
    """
    Get a row from the DataFrame by its index.

    Args:
        index: Index of the row to retrieve.
    Returns:
        Row as a pandas Series if index is valid, else None.
    """
    if self.df is not None and 0 <= index < len(self.df):
        return self.df.iloc[index]
    return None
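
Example (not from the source): a sketch that chains the DataFrame accessors above; it assumes corpus.df has already been populated (for example, from a CSV loaded via the CLI).

from crisp_t.model.corpus import Corpus  # import path inferred from the source location above


def summarize_dataframe(corpus: Corpus) -> None:
    # Each helper degrades gracefully when corpus.df is None
    print("Columns:", corpus.get_all_df_column_names())
    print("Rows:", corpus.get_row_count())
    first_row = corpus.get_row_by_index(0)
    if first_row is not None:
        print(first_row)
    print(corpus.get_descriptive_statistics())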

get_row_count()

Get the number of rows in the DataFrame.

Returns: Number of rows in the DataFrame, or 0 if the DataFrame is None.

Source code in src/crisp_t/model/corpus.py, lines 258-267
def get_row_count(self):
    """
    Get the number of rows in the DataFrame.

    Returns:
        Number of rows in the DataFrame, or 0 if DataFrame is None.
    """
    if self.df is not None:
        return len(self.df)
    return 0

pretty_print(show='all')

Print the corpus information in a human-readable format.

Parameters:

show: Display option (default 'all'). Can be:
- "all": Show all corpus information
- "documents": Show first 5 documents
- "documents N": Show first N documents (e.g., "documents 10")
- "documents metadata": Show document-specific metadata
- "dataframe": Show DataFrame head
- "dataframe metadata": Show DataFrame metadata columns (metadata_*)
- "dataframe stats": Show DataFrame statistics
- "metadata": Show all corpus metadata
- "metadata KEY": Show specific metadata key (e.g., "metadata pca")
- "stats": Show DataFrame statistics (deprecated, use "dataframe stats")
Source code in src/crisp_t/model/corpus.py, lines 53-215
def pretty_print(self, show="all"):
    """
    Print the corpus information in a human-readable format.

    Args:
        show: Display option. Can be:
            - "all": Show all corpus information
            - "documents": Show first 5 documents
            - "documents N": Show first N documents (e.g., "documents 10")
            - "documents metadata": Show document-specific metadata
            - "dataframe": Show DataFrame head
            - "dataframe metadata": Show DataFrame metadata columns (metadata_*)
            - "dataframe stats": Show DataFrame statistics
            - "metadata": Show all corpus metadata
            - "metadata KEY": Show specific metadata key (e.g., "metadata pca")
            - "stats": Show DataFrame statistics (deprecated, use "dataframe stats")
    """
    # Color codes for terminal output
    BLUE = "\033[94m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    CYAN = "\033[96m"
    MAGENTA = "\033[95m"
    RED = "\033[91m"
    RESET = "\033[0m"
    BOLD = "\033[1m"

    # Parse the show parameter to support subcommands
    parts = show.split(maxsplit=1)
    main_command = parts[0]
    sub_command = parts[1] if len(parts) > 1 else None

    # Print basic corpus info for most commands
    if main_command in ["all", "documents", "dataframe", "metadata"]:
        print(f"{BOLD}{BLUE}Corpus ID:{RESET} {self.id}")
        print(f"{BOLD}{BLUE}Name:{RESET} {self.name}")
        print(f"{BOLD}{BLUE}Description:{RESET} {self.description}")

    # Handle documents command
    if main_command in ["all", "documents"]:
        if sub_command == "metadata":
            # Show document-specific metadata
            print(f"\n{BOLD}{GREEN}=== Document Metadata ==={RESET}")
            if not self.documents:
                print("No documents in corpus")
            else:
                for i, doc in enumerate(self.documents, 1):
                    print(f"\n{CYAN}Document {i}:{RESET}")
                    print(f"  {BOLD}ID:{RESET} {doc.id}")
                    print(f"  {BOLD}Name:{RESET} {doc.name}")
                    if doc.metadata:
                        print(f"  {BOLD}Metadata:{RESET}")
                        for key, value in doc.metadata.items():
                            # Truncate long values
                            val_str = str(value)
                            if len(val_str) > 100:
                                val_str = val_str[:97] + "..."
                            print(f"    {YELLOW}{key}:{RESET} {val_str}")
                    else:
                        print(f"  {BOLD}Metadata:{RESET} (none)")
        else:
            # Determine how many documents to show
            num_docs = 5  # default
            if sub_command:
                try:
                    num_docs = int(sub_command)
                except ValueError:
                    print(f"{RED}Invalid number for documents: {sub_command}. Using default (5).{RESET}")

            print(f"\n{BOLD}{GREEN}=== Documents ==={RESET}")
            print(f"Total documents: {len(self.documents)}")
            print(f"Showing first {min(num_docs, len(self.documents))} document(s):\n")

            for i, doc in enumerate(self.documents[:num_docs], 1):
                print(f"{CYAN}Document {i}:{RESET}")
                print(f"  {BOLD}Name:{RESET} {doc.name}")
                print(f"  {BOLD}ID:{RESET} {doc.id}")
                # Show a snippet of text if available
                if hasattr(doc, 'text') and doc.text:
                    text_snippet = doc.text[:200] + "..." if len(doc.text) > 200 else doc.text
                    print(f"  {BOLD}Text:{RESET} {text_snippet}")
                print()

    # Handle dataframe command
    if main_command in ["all", "dataframe"]:
        if self.df is not None:
            if sub_command == "metadata":
                # Show DataFrame metadata columns (columns starting with metadata_)
                print(f"\n{BOLD}{GREEN}=== DataFrame Metadata Columns ==={RESET}")
                metadata_cols = [col for col in self.df.columns if col.startswith("metadata_")]
                if metadata_cols:
                    print(f"Found {len(metadata_cols)} metadata column(s):")
                    for col in metadata_cols:
                        print(f"  {YELLOW}{col}{RESET}")
                        # Show some statistics for the metadata column
                        print(f"    Non-null values: {self.df[col].notna().sum()}")
                        print(f"    Null values: {self.df[col].isna().sum()}")
                        # Show unique values if not too many
                        unique_count = self.df[col].nunique()
                        if unique_count <= 10:
                            print(f"    Unique values ({unique_count}): {list(self.df[col].unique())}")
                        else:
                            print(f"    Unique values: {unique_count}")
                else:
                    print("No metadata columns found (columns starting with 'metadata_')")
            elif sub_command == "stats":
                # Show DataFrame statistics
                self._print_dataframe_stats()
            else:
                # Show DataFrame head
                print(f"\n{BOLD}{GREEN}=== DataFrame ==={RESET}")
                print(f"Shape: {self.df.shape}")
                print(f"Columns: {list(self.df.columns)}")
                print("\nFirst few rows:")
                print(self.df.head())
        else:
            if main_command == "dataframe":
                print(f"\n{BOLD}{RED}No DataFrame available{RESET}")

    # Handle metadata command
    if main_command in ["all", "metadata"]:
        if sub_command:
            # Show specific metadata key
            print(f"\n{BOLD}{GREEN}=== Metadata: {sub_command} ==={RESET}")
            if sub_command in self.metadata:
                value = self.metadata[sub_command]
                # Format the output based on the type of value
                if isinstance(value, dict):
                    for k, v in value.items():
                        print(f"{YELLOW}{k}:{RESET} {v}")
                elif isinstance(value, list):
                    for i, item in enumerate(value, 1):
                        print(f"{i}. {item}")
                else:
                    print(value)
            else:
                print(f"{RED}Metadata key '{sub_command}' not found{RESET}")
                available_keys = list(self.metadata.keys())
                if available_keys:
                    print(f"Available keys: {', '.join(available_keys)}")
        else:
            # Show all metadata
            print(f"\n{BOLD}{GREEN}=== Corpus Metadata ==={RESET}")
            if not self.metadata:
                print("No metadata available")
            else:
                for key, value in self.metadata.items():
                    print(f"\n{MAGENTA}{key}:{RESET}")
                    # Truncate long values
                    val_str = str(value)
                    if len(val_str) > 500:
                        val_str = val_str[:497] + "..."
                    print(f"  {val_str}")

    # Handle stats command (deprecated, redirect to dataframe stats)
    if main_command == "stats":
        print(f"{YELLOW}Note: 'stats' is deprecated. Use 'dataframe stats' instead.{RESET}")
        if self.df is not None:
            self._print_dataframe_stats()
        else:
            print(f"{RED}No DataFrame available{RESET}")

    print(f"\n{BOLD}Display completed for '{show}'{RESET}")

remove_document_by_id(document_id)

Remove a document from the corpus by its ID.

Parameters:

document_id (str): ID of the document to remove. Required.
Source code in src/crisp_t/model/corpus.py, lines 315-324
def remove_document_by_id(self, document_id: str):
    """
    Remove a document from the corpus by its ID.

    Args:
        document_id: ID of the document to remove.
    """
    self.documents = [
        doc for doc in self.documents if doc.id != document_id
    ]

update_metadata(key, value)

Update the metadata of the corpus.

Parameters:

key (str): Metadata key to update. Required.
value (Any): New value for the metadata key. Required.
Source code in src/crisp_t/model/corpus.py, lines 326-334
def update_metadata(self, key: str, value: Any):
    """
    Update the metadata of the corpus.

    Args:
        key: Metadata key to update.
        value: New value for the metadata key.
    """
    self.metadata[key] = value
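
Example (not from the source): a minimal sketch of update_metadata; the keys and values are illustrative only.

from crisp_t.model.corpus import Corpus  # import path inferred from the source location above


def tag_corpus(corpus: Corpus) -> None:
    corpus.update_metadata("topics", ["workload", "fatigue", "staffing"])
    corpus.update_metadata("source", "interview transcripts")
    print(corpus.metadata["topics"])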

Copyright (C) 2025 Bell Eapen

This file is part of crisp-t.

crisp-t is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

crisp-t is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with crisp-t. If not, see https://www.gnu.org/licenses/.

Document

Bases: BaseModel

Document model for storing text and metadata.

Source code in src/crisp_t/model/document.py, lines 24-48
class Document(BaseModel):
    """
    Document model for storing text and metadata.
    """
    id: str = Field(..., description="Unique identifier for the document.")
    name: Optional[str] = Field(None, description="Name of the document.")
    description: Optional[str] = Field(None, description="Description of the document.")
    score: float = Field(0.0, description="Score associated with the document.")
    text: str = Field(..., description="The text content of the document.")
    metadata: dict = Field(
        default_factory=dict, description="Metadata associated with the document."
    )

    def pretty_print(self):
        """
        Print the document information in a human-readable format.
        """
        print(f"Document ID: {self.id}")
        print(f"Name: {self.name}")
        print(f"Description: {self.description}")
        print(f"Score: {self.score}")
        print(f"Text: {self.text[:100]}...")  # Print first 100 characters of text
        print(f"Metadata: {self.metadata}")
        print(f"Length of Text: {len(self.text)} characters")
        print(f"Number of Metadata Entries: {len(self.metadata)}")

pretty_print()

Print the document information in a human-readable format.

Source code in src/crisp_t/model/document.py, lines 37-48
def pretty_print(self):
    """
    Print the document information in a human-readable format.
    """
    print(f"Document ID: {self.id}")
    print(f"Name: {self.name}")
    print(f"Description: {self.description}")
    print(f"Score: {self.score}")
    print(f"Text: {self.text[:100]}...")  # Print first 100 characters of text
    print(f"Metadata: {self.metadata}")
    print(f"Length of Text: {len(self.text)} characters")
    print(f"Number of Metadata Entries: {len(self.metadata)}")

Csv

Source code in src/crisp_t/csv.py, lines 17-471
class Csv:

    def __init__(
        self,
        corpus: Optional[Corpus] = None,
        comma_separated_text_columns: str = "",
        comma_separated_ignore_columns: str = "",
        id_column: str = "id",
    ):
        """
        Initialize the Csv object.
        """
        self._corpus = corpus
        if self._corpus is None:
            self._df = pd.DataFrame()
            logger.info("No corpus provided. Creating an empty DataFrame.")
        else:
            self._df = self._corpus.df
            if self._df is None:
                logger.info("No DataFrame found in the corpus. Creating a new one.")
                self._df = pd.DataFrame()
        self._df_original = self._df.copy()
        self._comma_separated_text_columns = comma_separated_text_columns
        self._comma_separated_ignore_columns = comma_separated_ignore_columns
        self._id_column = id_column
        self._X = None
        self._y = None
        self._X_original = None
        self._y_original = None

    @property
    def corpus(self) -> Optional[Corpus]:
        if self._corpus is not None and self._df is not None:
            self._corpus.df = self._df
        return self._corpus

    @property
    def df(self) -> pd.DataFrame:
        if self._df is None:
            return pd.DataFrame()
        return self._df

    @property
    def comma_separated_text_columns(self) -> str:
        return self._comma_separated_text_columns

    @property
    def comma_separated_ignore_columns(self) -> str:
        return self._comma_separated_ignore_columns

    @comma_separated_ignore_columns.setter
    def comma_separated_ignore_columns(self, value: str) -> None:
        self._comma_separated_ignore_columns = value
        logger.info("Comma-separated ignore columns set successfully.")
        logger.debug(
            f"Comma-separated ignore columns: {self._comma_separated_ignore_columns}"
        )
        self._process_columns()

    @property
    def id_column(self) -> str:
        return self._id_column

    @corpus.setter
    def corpus(self, value: Corpus) -> None:
        self._corpus = value
        if self._corpus is not None:
            self._df = self._corpus.df
            if self._df is None:
                logger.info("No DataFrame found in the corpus. Creating a new one.")
                self._df = pd.DataFrame()
            self._df_original = self._df.copy()
            logger.info("Corpus set successfully.")
            logger.debug(f"DataFrame content: {self._df.head()}")
            logger.debug(f"DataFrame shape: {self._df.shape}")
            logger.debug(f"DataFrame columns: {self._df.columns.tolist()}")
        else:
            logger.error("Failed to set corpus. Corpus is None.")

    @df.setter
    def df(self, value: pd.DataFrame) -> None:
        self._df = value
        logger.info("DataFrame set successfully.")
        logger.debug(f"DataFrame content: {self._df.head()}")
        logger.debug(f"DataFrame shape: {self._df.shape}")
        logger.debug(f"DataFrame columns: {self._df.columns.tolist()}")

    @comma_separated_text_columns.setter
    def comma_separated_text_columns(self, value: str) -> None:
        self._comma_separated_text_columns = value
        logger.info("Comma-separated text columns set successfully.")
        logger.debug(
            f"Comma-separated text columns: {self._comma_separated_text_columns}"
        )
        self._process_columns()

    @id_column.setter
    def id_column(self, value: str) -> None:
        self._id_column = value
        # Add id column to the list of ignored columns
        ignore_cols = [
            col
            for col in self._comma_separated_ignore_columns.split(",")
            if col.strip()
        ]
        if value not in ignore_cols:
            ignore_cols.append(value)
            self._comma_separated_ignore_columns = ",".join(ignore_cols)
            logger.debug(
                f"ID column '{value}' added to ignore columns: {self._comma_separated_ignore_columns}"
            )
        logger.info("ID column set successfully.")
        logger.debug(f"ID column: {self._id_column}")

    # TODO remove @deprecated
    #! Do not use
    def read_csv(self, file_path: str):
        """
        Read a CSV file and create a DataFrame.
        """
        try:
            self._df = pd.read_csv(file_path)
            logger.info(f"CSV file {file_path} read successfully.")
            logger.debug(f"DataFrame content: {self._df.head()}")
            logger.debug(f"DataFrame shape: {self._df.shape}")
            logger.debug(f"DataFrame columns: {self._df.columns.tolist()}")
        except Exception as e:
            logger.error(f"Error reading CSV file: {e}")
            raise
        return self._process_columns()

    def _process_columns(self):
        # ignore comma-separated ignore columns
        if self._comma_separated_ignore_columns:
            ignore_columns = [
                col.strip()
                for col in self._comma_separated_ignore_columns.split(",")
                if col.strip()
            ]
            self._df.drop(columns=ignore_columns, inplace=True, errors="ignore")
            logger.info(
                f"Ignored columns: {ignore_columns}. Updated DataFrame shape: {self._df.shape}"
            )
            logger.debug(f"DataFrame content after dropping columns: {self._df.head()}")
        # ignore comma-separated text columns
        if self._comma_separated_text_columns:
            text_columns = [
                col.strip()
                for col in self._comma_separated_text_columns.split(",")
                if col.strip()
            ]
            for col in text_columns:
                if col in self._df.columns:
                    self._df[col] = self._df[col].astype(str)
                    logger.info(f"Column {col} converted to string.")
                    logger.debug(f"Column {col} content: {self._df[col].head()}")
                else:
                    logger.warning(f"Column {col} not found in DataFrame.")
        # ignore all columns with names starting with "metadata_"
        self._df = self._df.loc[:, ~self._df.columns.str.startswith("metadata_")]
        return self._df

    def write_csv(self, file_path: str, index: bool = False) -> None:
        if self._df is not None:
            self._df.to_csv(file_path, index=index)
            logger.info(f"DataFrame written to {file_path}")
            logger.debug(f"DataFrame content: {self._df.head()}")
            logger.debug(f"Index: {index}")
        else:
            logger.error("DataFrame is None. Cannot write to CSV.")

    def mark_missing(self):
        """Mark missing values in the DataFrame.
        Missing values are considered as empty strings and are replaced with NaN.
        Rows with NaN values are then dropped from the DataFrame.
        """
        if self._df is not None:
            self._df.replace("", np.nan, inplace=True)
            self._df.dropna(inplace=True)
        else:
            logger.error("DataFrame is None. Cannot mark missing values.")

    def mark_duplicates(self):
        """Mark duplicate rows in the DataFrame.
        Duplicate rows are identified and dropped from the DataFrame.
        """
        if self._df is not None:
            self._df.drop_duplicates(inplace=True)
        else:
            logger.error("DataFrame is None. Cannot mark duplicates.")

    def restore_df(self):
        self._df = self._df_original.copy()

    def get_shape(self):
        if self._df is not None:
            return self._df.shape
        else:
            logger.error("DataFrame is None. Cannot get shape.")
            return None

    def get_columns(self):
        """Get the list of columns in the DataFrame."""
        if self._df is not None:
            return self._df.columns.tolist()
        else:
            logger.error("DataFrame is None. Cannot get columns.")
            return []

    def get_column_types(self):
        """Get the data types of columns in the DataFrame."""
        if self._df is not None:
            return self._df.dtypes.to_dict()
        else:
            logger.error("DataFrame is None. Cannot get column types.")
            return {}

    def get_column_values(self, column_name: str):
        """Get the unique values in a column of the DataFrame."""
        if self._df is not None and column_name in self._df.columns:
            return self._df[column_name].tolist()
        else:
            logger.error(
                f"Column {column_name} not found in DataFrame or DataFrame is None."
            )
            return None

    def retain_numeric_columns_only(self):
        """Retain only numeric columns in the DataFrame."""
        if self._df is not None:
            self._df = self._df.select_dtypes(include=[np.number])
            logger.info("DataFrame filtered to numeric columns only.")
        else:
            logger.error("DataFrame is None. Cannot filter to numeric columns.")

    def comma_separated_include_columns(self, include_cols: str = ""):
        """Retain only specified columns in the DataFrame."""
        if include_cols == "":
            return
        if self._df is not None:
            cols = [
                col.strip()
                for col in include_cols.split(",")
                if col.strip() and col in self._df.columns
            ]
            self._df = self._df[cols]
            logger.info(f"DataFrame filtered to include columns: {cols}")
        else:
            logger.error("DataFrame is None. Cannot filter to include columns.")

    def read_xy(self, y: str):
        """
        Read X and y variables from the DataFrame.
        """
        if self._df is None:
            logger.error("DataFrame is None. Cannot read X and y.")
            return None, None
        # Split into X and y
        if y == "":
            self._y = None
        else:
            self._y = self._df[y]
        if y != "":
            self._X = self._df.drop(columns=[y])
        else:
            self._X = self._df.copy()
        logger.info(f"X and y variables set. X shape: {self._X.shape}")
        return self._X, self._y

    def drop_na(self):
        """Drop rows with any NA values from the DataFrame."""
        if self._df is not None:
            self._df.dropna(inplace=True)
            logger.info("Missing values dropped from DataFrame.")
        else:
            logger.error("DataFrame is None. Cannot drop missing values.")

    def oversample(self, mcp: bool = False):
        self._X_original = self._X
        self._y_original = self._y
        try:
            from imblearn.over_sampling import RandomOverSampler

            ros = RandomOverSampler(random_state=0)
        except ImportError:
            logger.info(
                "ML dependencies are not installed. Please install them with `pip install crisp-t[ml]` to use ML features."
            )
            return

        result = ros.fit_resample(self._X, self._y)
        if len(result) == 2:
            X, y = result
        elif len(result) == 3:
            X, y, _ = result
        else:
            logger.error("Unexpected number of values returned from fit_resample.")
            return
        self._X = X
        self._y = y
        if mcp:
            return f"Oversampling completed. New X shape: {self._X.shape}"
        return X, y

    def restore_oversample(self, mcp: bool = False):
        self._X = self._X_original
        self._y = self._y_original
        if mcp:
            return f"Oversampling restored. X shape: {self._X.shape}, y shape: {self._y.shape}"  # type: ignore

    def prepare_data(self, y: str, oversample=False, one_hot_encode_all=False):
        self.mark_missing()
        if oversample:
            self.oversample()
        self.one_hot_encode_strings_in_df()
        if one_hot_encode_all:
            self.one_hot_encode_all_columns()
        return self.read_xy(y)

    def bin_a_column(self, column_name: str, bins: int = 2):
        """Bin a numeric column into specified number of bins."""
        if self._df is not None and column_name in self._df.columns:
            if pd.api.types.is_numeric_dtype(self._df[column_name]):
                self._df[column_name] = pd.cut(
                    self._df[column_name], bins=bins, labels=False
                )
                logger.info(f"Column {column_name} binned into {bins} bins.")
                return "I have binned the column. Please proceed."
            else:
                logger.warning(f"Column {column_name} is not numeric. Cannot bin.")
        else:
            logger.warning(
                f"Column {column_name} not found in DataFrame or DataFrame is None."
            )
        return "I cannot bin the column. Please check the logs for more information."

    def one_hot_encode_column(self, column_name: str):
        """One-hot encode a specific column in the DataFrame.
        This method converts a categorical column into one-hot encoded columns.
        Used when # ValueError: could not convert string to float.
        """
        if self._df is not None and column_name in self._df.columns:
            if pd.api.types.is_object_dtype(self._df[column_name]):
                self._df = pd.get_dummies(
                    self._df, columns=[column_name], drop_first=True
                )
                logger.info(f"One-hot encoding applied to column {column_name}.")
                return "I have one-hot encoded the column. Please proceed."
            else:
                logger.warning(f"Column {column_name} is not of object type.")
        else:
            logger.error(
                f"Column {column_name} not found in DataFrame or DataFrame is None."
            )
        return "I cannot one-hot encode the column. Please check the logs for more information."

    def one_hot_encode_strings_in_df(self, n=10, filter_high_cardinality=False):
        """One-hot encode string (object) columns in the DataFrame.
        This method converts categorical string columns into one-hot encoded columns.
        Columns with more than n unique values can be optionally filtered out.
        Used when # ValueError: could not convert string to float.
        """
        if self._df is not None:
            categorical_cols = self._df.select_dtypes(
                include=["object"]
            ).columns.tolist()
            # Remove categorical columns with more than n unique values
            if filter_high_cardinality:
                categorical_cols = [
                    col for col in categorical_cols if self._df[col].nunique() <= n
                ]
            if categorical_cols:
                self._df = pd.get_dummies(
                    self._df, columns=categorical_cols, drop_first=True
                )
                logger.info("One-hot encoding applied to string columns.")
            else:
                logger.info("No string (object) columns found for one-hot encoding.")
        else:
            logger.error("DataFrame is None. Cannot apply one-hot encoding.")

    def one_hot_encode_all_columns(self):
        """One-hot encode all columns in the DataFrame.
        This method converts all values in the DataFrame to boolean values.
        Used for apriori algorithm which requires boolean values.
        """
        if self._df is not None:

            def to_one_hot(x):
                if x in [1, True]:
                    return True
                elif x in [0, False]:
                    return False
                else:
                    # logger.warning(
                    #     f"Unexpected value '{x}' encountered during one-hot encoding; mapping to 1."
                    # )
                    return True

            self._df = self._df.applymap(to_one_hot)  # type: ignore

    def filter_rows_by_column_value(
        self, column_name: str, value, mcp: bool = False
    ):
        """Select rows from the DataFrame where the specified column matches the given value.
        Additionally, filter self._corpus.documents by id_column if present in DataFrame.
        """
        if self._df is not None and column_name in self._df.columns:
            selected_df = self._df[self._df[column_name] == value]
            if selected_df.empty:
                # try int search
                try:
                    selected_df = self._df[self._df[column_name] == int(value)]
                except (ValueError, TypeError):
                    logger.warning(
                        f"Could not convert value '{value}' to int for column '{column_name}'."
                    )
            logger.info(
                f"Selected {selected_df.shape[0]} rows where {column_name} == {value}."
            )
            self._df = selected_df

            # Check for id_column in DataFrame
            if (
                self._corpus is not None
                and hasattr(self._corpus, "df")
                and self._id_column in self._corpus.df.columns
            ):
                logger.info(f"id_column '{self._id_column}' exists in DataFrame.")
                valid_ids = set(self._corpus.df[self._id_column].tolist())
                if (
                    hasattr(self._corpus, "documents")
                    and self._corpus.documents is not None
                ):
                    filtered_docs = [
                        doc
                        for doc in self._corpus.documents
                        if getattr(doc, self._id_column, None) in valid_ids
                    ]
                    self._corpus.documents = filtered_docs
            else:
                logger.warning(f"id_column '{self._id_column}' does not exist in DataFrame.")

            if mcp:
                return f"Selected {selected_df.shape[0]} rows where {column_name} == {value}."
        else:
            logger.warning(
                f"Column {column_name} not found in DataFrame or DataFrame is None."
            )
            if mcp:
                return (
                    f"Column {column_name} not found in DataFrame or DataFrame is None."
                )
            return pd.DataFrame()
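
Example (not from the source): a short end-to-end sketch of the Csv helper on a plain DataFrame with no corpus attached. The column names and the import path (inferred from src/crisp_t/csv.py) are illustrative.

import pandas as pd

from crisp_t.csv import Csv

csv = Csv()
csv.df = pd.DataFrame(
    {
        "id": [1, 2, 3, 4],
        "age": [34, 51, 42, 29],
        "gender": ["f", "m", "f", "m"],
        "outcome": [0, 1, 1, 0],
    }
)
csv.comma_separated_ignore_columns = "id"  # the setter drops ignored columns immediately

# Clean, one-hot encode string columns, and split into X and y
X, y = csv.prepare_data(y="outcome")
print(X.shape, y.shape)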

__init__(corpus=None, comma_separated_text_columns='', comma_separated_ignore_columns='', id_column='id')

Initialize the Csv object.

Source code in src/crisp_t/csv.py, lines 19-46
def __init__(
    self,
    corpus: Optional[Corpus] = None,
    comma_separated_text_columns: str = "",
    comma_separated_ignore_columns: str = "",
    id_column: str = "id",
):
    """
    Initialize the Csv object.
    """
    self._corpus = corpus
    if self._corpus is None:
        self._df = pd.DataFrame()
        logger.info("No corpus provided. Creating an empty DataFrame.")
    else:
        self._df = self._corpus.df
        if self._df is None:
            logger.info("No DataFrame found in the corpus. Creating a new one.")
            self._df = pd.DataFrame()
    self._df_original = self._df.copy()
    self._comma_separated_text_columns = comma_separated_text_columns
    self._comma_separated_ignore_columns = comma_separated_ignore_columns
    self._id_column = id_column
    self._X = None
    self._y = None
    self._X_original = None
    self._y_original = None

bin_a_column(column_name, bins=2)

Bin a numeric column into specified number of bins.

Source code in src/crisp_t/csv.py, lines 337-352
def bin_a_column(self, column_name: str, bins: int = 2):
    """Bin a numeric column into specified number of bins."""
    if self._df is not None and column_name in self._df.columns:
        if pd.api.types.is_numeric_dtype(self._df[column_name]):
            self._df[column_name] = pd.cut(
                self._df[column_name], bins=bins, labels=False
            )
            logger.info(f"Column {column_name} binned into {bins} bins.")
            return "I have binned the column. Please proceed."
        else:
            logger.warning(f"Column {column_name} is not numeric. Cannot bin.")
    else:
        logger.warning(
            f"Column {column_name} not found in DataFrame or DataFrame is None."
        )
    return "I cannot bin the column. Please check the logs for more information."

comma_separated_include_columns(include_cols='')

Retain only specified columns in the DataFrame.

Source code in src/crisp_t/csv.py, lines 253-266
def comma_separated_include_columns(self, include_cols: str = ""):
    """Retain only specified columns in the DataFrame."""
    if include_cols == "":
        return
    if self._df is not None:
        cols = [
            col.strip()
            for col in include_cols.split(",")
            if col.strip() and col in self._df.columns
        ]
        self._df = self._df[cols]
        logger.info(f"DataFrame filtered to include columns: {cols}")
    else:
        logger.error("DataFrame is None. Cannot filter to include columns.")

drop_na()

Drop rows with any NA values from the DataFrame.

Source code in src/crisp_t/csv.py, lines 287-293
def drop_na(self):
    """Drop rows with any NA values from the DataFrame."""
    if self._df is not None:
        self._df.dropna(inplace=True)
        logger.info("Missing values dropped from DataFrame.")
    else:
        logger.error("DataFrame is None. Cannot drop missing values.")

filter_rows_by_column_value(column_name, value, mcp=False)

Select rows from the DataFrame where the specified column matches the given value. Additionally, filter self._corpus.documents by id_column if present in DataFrame.

Source code in src/crisp_t/csv.py, lines 419-471
def filter_rows_by_column_value(
    self, column_name: str, value, mcp: bool = False
):
    """Select rows from the DataFrame where the specified column matches the given value.
    Additionally, filter self._corpus.documents by id_column if present in DataFrame.
    """
    if self._df is not None and column_name in self._df.columns:
        selected_df = self._df[self._df[column_name] == value]
        if selected_df.empty:
            # try int search
            try:
                selected_df = self._df[self._df[column_name] == int(value)]
            except (ValueError, TypeError):
                logger.warning(
                    f"Could not convert value '{value}' to int for column '{column_name}'."
                )
        logger.info(
            f"Selected {selected_df.shape[0]} rows where {column_name} == {value}."
        )
        self._df = selected_df

        # Check for id_column in DataFrame
        if (
            self._corpus is not None
            and hasattr(self._corpus, "df")
            and self._id_column in self._corpus.df.columns
        ):
            logger.info(f"id_column '{self._id_column}' exists in DataFrame.")
            valid_ids = set(self._corpus.df[self._id_column].tolist())
            if (
                hasattr(self._corpus, "documents")
                and self._corpus.documents is not None
            ):
                filtered_docs = [
                    doc
                    for doc in self._corpus.documents
                    if getattr(doc, self._id_column, None) in valid_ids
                ]
                self._corpus.documents = filtered_docs
        else:
            logger.warning(f"id_column '{self._id_column}' does not exist in DataFrame.")

        if mcp:
            return f"Selected {selected_df.shape[0]} rows where {column_name} == {value}."
    else:
        logger.warning(
            f"Column {column_name} not found in DataFrame or DataFrame is None."
        )
        if mcp:
            return (
                f"Column {column_name} not found in DataFrame or DataFrame is None."
            )
        return pd.DataFrame()
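
Example (not from the source): filtering rows on a column value, with illustrative data. Without an attached corpus the document-filtering branch is skipped and only the DataFrame is reduced.

import pandas as pd

from crisp_t.csv import Csv  # import path inferred from the source location above

csv = Csv()
csv.df = pd.DataFrame({"site": ["a", "a", "b"], "score": [3, 5, 4]})
csv.filter_rows_by_column_value("site", "a")
print(csv.get_shape())  # (2, 2): only the rows where site == "a" remain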

get_column_types()

Get the data types of columns in the DataFrame.

Source code in src/crisp_t/csv.py, lines 227-233
def get_column_types(self):
    """Get the data types of columns in the DataFrame."""
    if self._df is not None:
        return self._df.dtypes.to_dict()
    else:
        logger.error("DataFrame is None. Cannot get column types.")
        return {}

get_column_values(column_name)

Get the unique values in a column of the DataFrame.

Source code in src/crisp_t/csv.py, lines 235-243
def get_column_values(self, column_name: str):
    """Get the unique values in a column of the DataFrame."""
    if self._df is not None and column_name in self._df.columns:
        return self._df[column_name].tolist()
    else:
        logger.error(
            f"Column {column_name} not found in DataFrame or DataFrame is None."
        )
        return None

get_columns()

Get the list of columns in the DataFrame.

Source code in src/crisp_t/csv.py, lines 219-225
def get_columns(self):
    """Get the list of columns in the DataFrame."""
    if self._df is not None:
        return self._df.columns.tolist()
    else:
        logger.error("DataFrame is None. Cannot get columns.")
        return []

mark_duplicates()

Mark duplicate rows in the DataFrame. Duplicate rows are identified and dropped from the DataFrame.

Source code in src/crisp_t/csv.py, lines 200-207
def mark_duplicates(self):
    """Mark duplicate rows in the DataFrame.
    Duplicate rows are identified and dropped from the DataFrame.
    """
    if self._df is not None:
        self._df.drop_duplicates(inplace=True)
    else:
        logger.error("DataFrame is None. Cannot mark duplicates.")

mark_missing()

Mark missing values in the DataFrame. Empty strings are treated as missing and replaced with NaN, and rows containing NaN values are then dropped from the DataFrame.

Source code in src/crisp_t/csv.py, lines 189-198
def mark_missing(self):
    """Mark missing values in the DataFrame.
    Missing values are considered as empty strings and are replaced with NaN.
    Rows with NaN values are then dropped from the DataFrame.
    """
    if self._df is not None:
        self._df.replace("", np.nan, inplace=True)
        self._df.dropna(inplace=True)
    else:
        logger.error("DataFrame is None. Cannot mark missing values.")

one_hot_encode_all_columns()

One-hot encode all columns in the DataFrame. This method converts all values in the DataFrame to boolean values, as required by the apriori algorithm.

Source code in src/crisp_t/csv.py, lines 399-417
def one_hot_encode_all_columns(self):
    """One-hot encode all columns in the DataFrame.
    This method converts all values in the DataFrame to boolean values.
    Used for apriori algorithm which requires boolean values.
    """
    if self._df is not None:

        def to_one_hot(x):
            if x in [1, True]:
                return True
            elif x in [0, False]:
                return False
            else:
                # logger.warning(
                #     f"Unexpected value '{x}' encountered during one-hot encoding; mapping to 1."
                # )
                return True

        self._df = self._df.applymap(to_one_hot)  # type: ignore

one_hot_encode_column(column_name)

One-hot encode a specific column in the DataFrame. This method converts a categorical column into one-hot encoded columns. Useful when model training fails with "ValueError: could not convert string to float".

Source code in src/crisp_t/csv.py, lines 354-372
def one_hot_encode_column(self, column_name: str):
    """One-hot encode a specific column in the DataFrame.
    This method converts a categorical column into one-hot encoded columns.
    Used when # ValueError: could not convert string to float.
    """
    if self._df is not None and column_name in self._df.columns:
        if pd.api.types.is_object_dtype(self._df[column_name]):
            self._df = pd.get_dummies(
                self._df, columns=[column_name], drop_first=True
            )
            logger.info(f"One-hot encoding applied to column {column_name}.")
            return "I have one-hot encoded the column. Please proceed."
        else:
            logger.warning(f"Column {column_name} is not of object type.")
    else:
        logger.error(
            f"Column {column_name} not found in DataFrame or DataFrame is None."
        )
    return "I cannot one-hot encode the column. Please check the logs for more information."

one_hot_encode_strings_in_df(n=10, filter_high_cardinality=False)

One-hot encode string (object) columns in the DataFrame. This method converts categorical string columns into one-hot encoded columns. Columns with more than n unique values can optionally be filtered out. Useful when model training fails with "ValueError: could not convert string to float".

Source code in src/crisp_t/csv.py, lines 374-397
def one_hot_encode_strings_in_df(self, n=10, filter_high_cardinality=False):
    """One-hot encode string (object) columns in the DataFrame.
    This method converts categorical string columns into one-hot encoded columns.
    Columns with more than n unique values can be optionally filtered out.
    Used when # ValueError: could not convert string to float.
    """
    if self._df is not None:
        categorical_cols = self._df.select_dtypes(
            include=["object"]
        ).columns.tolist()
        # Remove categorical columns with more than n unique values
        if filter_high_cardinality:
            categorical_cols = [
                col for col in categorical_cols if self._df[col].nunique() <= n
            ]
        if categorical_cols:
            self._df = pd.get_dummies(
                self._df, columns=categorical_cols, drop_first=True
            )
            logger.info("One-hot encoding applied to string columns.")
        else:
            logger.info("No string (object) columns found for one-hot encoding.")
    else:
        logger.error("DataFrame is None. Cannot apply one-hot encoding.")

read_csv(file_path)

Read a CSV file and create a DataFrame.

Source code in src/crisp_t/csv.py, lines 134-147
def read_csv(self, file_path: str):
    """
    Read a CSV file and create a DataFrame.
    """
    try:
        self._df = pd.read_csv(file_path)
        logger.info(f"CSV file {file_path} read successfully.")
        logger.debug(f"DataFrame content: {self._df.head()}")
        logger.debug(f"DataFrame shape: {self._df.shape}")
        logger.debug(f"DataFrame columns: {self._df.columns.tolist()}")
    except Exception as e:
        logger.error(f"Error reading CSV file: {e}")
        raise
    return self._process_columns()

read_xy(y)

Read X and y variables from the DataFrame.

Source code in src/crisp_t/csv.py, lines 268-285
def read_xy(self, y: str):
    """
    Read X and y variables from the DataFrame.
    """
    if self._df is None:
        logger.error("DataFrame is None. Cannot read X and y.")
        return None, None
    # Split into X and y
    if y == "":
        self._y = None
    else:
        self._y = self._df[y]
    if y != "":
        self._X = self._df.drop(columns=[y])
    else:
        self._X = self._df.copy()
    logger.info(f"X and y variables set. X shape: {self._X.shape}")
    return self._X, self._y
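
The split above is ordinary pandas column selection; a minimal sketch with made-up data:

import pandas as pd

df = pd.DataFrame({"age": [30, 41], "income": [50, 60], "outcome": [0, 1]})

y_name = "outcome"  # pass "" to skip the target, as read_xy does
y = df[y_name] if y_name else None
X = df.drop(columns=[y_name]) if y_name else df.copy()
print(X.columns.tolist(), list(y))  # ['age', 'income'] [0, 1]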

retain_numeric_columns_only()

Retain only numeric columns in the DataFrame.

Source code in src/crisp_t/csv.py, lines 245-251
def retain_numeric_columns_only(self):
    """Retain only numeric columns in the DataFrame."""
    if self._df is not None:
        self._df = self._df.select_dtypes(include=[np.number])
        logger.info("DataFrame filtered to numeric columns only.")
    else:
        logger.error("DataFrame is None. Cannot filter to numeric columns.")

Copyright (C) 2025 Bell Eapen

This file is part of crisp-t.

crisp-t is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

crisp-t is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with crisp-t. If not, see https://www.gnu.org/licenses/.

Text

Source code in src/crisp_t/text.py, lines 49-627
class Text:

    def __init__(
        self, corpus: Optional[Corpus] = None, lang="en_core_web_sm", max_length=1100000
    ):
        self._corpus = corpus
        self._lang = lang
        self._spacy_manager = SpacyManager(self._lang)
        self._max_length = max_length
        self._initial_document_count = len(self._corpus.documents) if corpus else 0  # type: ignore

        self._spacy_doc = None
        self._lemma = {}
        self._pos = {}
        self._pos_ = {}
        self._word = {}
        self._sentiment = {}
        self._tag = {}
        self._dep = {}
        self._prob = {}
        self._idx = {}

    @property
    def corpus(self):
        """
        Get the corpus.
        """
        if self._corpus is None:
            raise ValueError("Corpus is not set")
        return self._corpus

    @property
    def max_length(self):
        """
        Get the maximum length of the corpus.
        """
        return self._max_length

    @property
    def lang(self):
        """
        Get the language of the corpus.
        """
        return self._lang

    @property
    def initial_document_count(self):
        """
        Get the initial document count.
        """
        return self._initial_document_count

    @corpus.setter
    def corpus(self, corpus: Corpus):
        """
        Set the corpus.
        """
        if not isinstance(corpus, Corpus):
            raise ValueError("Corpus must be of type Corpus")
        self._corpus = corpus
        spacy_doc, results = self.process_tokens(self._corpus.id if self._corpus else None)
        self._spacy_doc = spacy_doc
        self._lemma = results["lemma"]
        self._pos = results["pos"]
        self._pos_ = results["pos_"]
        self._word = results["word"]
        self._sentiment = results["sentiment"]
        self._tag = results["tag"]
        self._dep = results["dep"]
        self._prob = results["prob"]
        self._idx = results["idx"]

    @max_length.setter
    def max_length(self, max_length: int):
        """
        Set the maximum length of the corpus.
        """
        if not isinstance(max_length, int):
            raise ValueError("max_length must be an integer")
        self._max_length = max_length
        if self._spacy_doc is not None:
            self._spacy_doc.max_length = max_length

    @lang.setter
    def lang(self, lang: str):
        """
        Set the language of the corpus.
        """
        if not isinstance(lang, str):
            raise ValueError("lang must be a string")
        self._lang = lang
        spacy_doc, results = self.process_tokens(self._corpus.id if self._corpus else None)
        self._spacy_doc = spacy_doc
        self._lemma = results["lemma"]
        self._pos = results["pos"]
        self._pos_ = results["pos_"]
        self._word = results["word"]
        self._sentiment = results["sentiment"]
        self._tag = results["tag"]
        self._dep = results["dep"]
        self._prob = results["prob"]
        self._idx = results["idx"]

    def make_spacy_doc(self):
        if self._corpus is None:
            raise ValueError("Corpus is not set")
        # Use list and join for efficient string concatenation instead of +=
        text_parts = []
        for document in tqdm(
            self._corpus.documents,
            desc="Processing documents",
            disable=len(self._corpus.documents) < 10,
        ):
            text_parts.append(self.process_text(document.text))
        text = " \n".join(text_parts)
        nlp = self._spacy_manager.get_model()
        nlp.max_length = self._max_length
        if len(text) > self._max_length:
            logger.warning(
                f"Text length {len(text)} exceeds max_length {self._max_length}."
            )
            text_chunks = [
                text[i : i + self._max_length]
                for i in range(0, len(text), self._max_length)
            ]
            spacy_docs = []
            for chunk in tqdm(
                text_chunks, desc="Processing text as chunks of max_length"
            ):
                spacy_doc = nlp(chunk)
                spacy_docs.append(spacy_doc)
            self._spacy_doc = spacy_docs[0]
            for doc in tqdm(spacy_docs[1:], desc="Merging spacy docs"):
                self._spacy_doc = Doc.from_docs([self._spacy_doc, doc])  # type: ignore
        else:
            self._spacy_doc = nlp(text)
        return self._spacy_doc

    # @lru_cache(maxsize=3)
    def make_each_document_into_spacy_doc(self, id="corpus"):
        if self._corpus is None:
            raise ValueError("Corpus is not set")

        # ! if cached file exists, load it
        cache_dir = Path("cache")
        cache_file = cache_dir / f"spacy_docs_{id}.pkl"
        if cache_file.exists():
            with open(cache_file, "rb") as f:
                spacy_docs, ids = pickle.load(f)
            logger.info("Loaded cached spacy docs and ids.")
            return spacy_docs, ids

        spacy_docs = []
        ids = []
        # Load SpaCy model once outside the loop for efficiency
        nlp = self._spacy_manager.get_model()
        nlp.max_length = self._max_length
        for document in tqdm(
            self._corpus.documents,
            desc="Creating spacy docs",
            disable=len(self._corpus.documents) < 10,
        ):
            text = self.process_text(document.text)
            spacy_doc = nlp(text)
            spacy_docs.append(spacy_doc)
            ids.append(document.id)

        # ! dump spacy_docs, ids to a file for caching with the corpus id
        cache_dir = Path("cache")
        cache_dir.mkdir(exist_ok=True)
        cache_file = cache_dir / f"spacy_docs_{id}.pkl"
        with open(cache_file, "wb") as f:
            pickle.dump((spacy_docs, ids), f)
        return spacy_docs, ids

    def process_text(self, text: str) -> str:
        """
        Process the text by removing unwanted characters and normalizing it.
        """
        # Remove unwanted characters
        text = preprocessing.replace.urls(text)
        text = preprocessing.replace.emails(text)
        text = preprocessing.replace.phone_numbers(text)
        text = preprocessing.replace.currency_symbols(text)
        text = preprocessing.replace.hashtags(text)
        text = preprocessing.replace.numbers(text)

        # lowercase the text
        text = text.lower()
        return text

    # @lru_cache(maxsize=3)
    def process_tokens(self, id="corpus"):
        """
        Process tokens in the spacy document and extract relevant information.
        """

        # ! if cached file exists, load it
        cache_dir = Path("cache")
        cache_file = cache_dir / f"spacy_doc_{id}.pkl"
        if cache_file.exists():
            with open(cache_file, "rb") as f:
                spacy_doc, results = pickle.load(f)
            logger.info("Loaded cached spacy doc and results.")
            return spacy_doc, results

        spacy_doc = self.make_spacy_doc()
        logger.info("Spacy doc created.")

        n_cores = multiprocessing.cpu_count()

        def process_token(token):
            if token.is_stop or token.is_digit or token.is_punct or token.is_space:
                return None
            if token.like_url or token.like_num or token.like_email:
                return None
            if len(token.text) < 3 or token.text.isupper():
                return None
            return {
                "text": token.text,
                "lemma": token.lemma_,
                "pos": token.pos_,
                "pos_": token.pos,
                "word": token.lemma_,
                "sentiment": token.sentiment,
                "tag": token.tag_,
                "dep": token.dep_,
                "prob": token.prob,
                "idx": token.idx,
            }

        tokens = list(spacy_doc)
        _lemma = {}
        _pos = {}
        _pos_ = {}
        _word = {}
        _sentiment = {}
        _tag = {}
        _dep = {}
        _prob = {}
        _idx = {}
        with ThreadPoolExecutor() as executor:
            futures = {executor.submit(process_token, token): token for token in tokens}
            with tqdm(
                total=len(futures),
                desc=f"Processing tokens (parallel, {n_cores} cores)",
            ) as pbar:
                for future in as_completed(futures):
                    result = future.result()
                    if result is not None:
                        _lemma[result["text"]] = result["lemma"]
                        _pos[result["text"]] = result["pos"]
                        _pos_[result["text"]] = result["pos_"]
                        _word[result["text"]] = result["word"]
                        _sentiment[result["text"]] = result["sentiment"]
                        _tag = result["tag"]
                        _dep = result["dep"]
                        _prob = result["prob"]
                        _idx = result["idx"]
                    pbar.update(1)
        logger.info("Token processing complete.")
        results = {
            "lemma": _lemma,
            "pos": _pos,
            "pos_": _pos_,
            "word": _word,
            "sentiment": _sentiment,
            "tag": _tag,
            "dep": _dep,
            "prob": _prob,
            "idx": _idx,
        }
        # ! dump spacy_doc, results to a file for caching with the corpus id
        cache_dir = Path("cache")
        cache_dir.mkdir(exist_ok=True)
        cache_file = cache_dir / f"spacy_doc_{id}.pkl"
        with open(cache_file, "wb") as f:
            pickle.dump((spacy_doc, results), f)

        return spacy_doc, results

    def map_spacy_doc(self):
        spacy_doc, results = self.process_tokens(self._corpus.id if self._corpus else None)
        self._spacy_doc = spacy_doc
        self._lemma = results["lemma"]
        self._pos = results["pos"]
        self._pos_ = results["pos_"]
        self._word = results["word"]
        self._sentiment = results["sentiment"]
        self._tag = results["tag"]
        self._dep = results["dep"]
        self._prob = results["prob"]
        self._idx = results["idx"]

    def common_words(self, index=10):
        self.map_spacy_doc()
        _words = {}
        for key, value in self._word.items():
            _words[value] = _words.get(value, 0) + 1
        return sorted(_words.items(), key=operator.itemgetter(1), reverse=True)[:index]

    def common_nouns(self, index=10):
        self.map_spacy_doc()
        _words = {}
        for key, value in self._word.items():
            if self._pos.get(key, None) == "NOUN":
                _words[value] = _words.get(value, 0) + 1
        return sorted(_words.items(), key=operator.itemgetter(1), reverse=True)[:index]

    def common_verbs(self, index=10):
        self.map_spacy_doc()
        _words = {}
        for key, value in self._word.items():
            if self._pos.get(key, None) == "VERB":
                _words[value] = _words.get(value, 0) + 1
        return sorted(_words.items(), key=operator.itemgetter(1), reverse=True)[:index]

    def print_coding_dictionary(self, num=10, top_n=5):
        """Prints a coding dictionary based on common verbs, attributes, and dimensions.
        "CATEGORY" is the common verb
        "PROPERTY" is the common nouns associated with the verb
        "DIMENSION" is the common adjectives/adverbs/verbs associated with the property
        Args:
            num (int, optional): Number of common verbs to consider. Defaults to 10.
            top_n (int, optional): Number of top attributes and dimensions to consider for each verb. Defaults to 5.

        """
        self.map_spacy_doc()
        output = []
        coding_dict = []
        output.append(("CATEGORY", "PROPERTY", "DIMENSION"))
        verbs = self.common_verbs(num)
        _verbs = []
        for verb, freq in verbs:
            _verbs.append(verb)
        for verb, freq in verbs:
            for attribute, f2 in self.attributes(verb, top_n):
                for dimension, f3 in self.dimensions(attribute, top_n):
                    if dimension not in _verbs:
                        output.append((verb, attribute, dimension))
                        coding_dict.append(f"{verb} > {attribute} > {dimension}")
        # Add coding_dict to corpus metadata
        if self._corpus is not None:
            self._corpus.metadata["coding_dict"] = coding_dict
        print("\n---Coding Dictionary---")
        QRUtils.print_table(output)
        print("---------------------------\n")
        return output

    def sentences_with_common_nouns(self, index=10):
        self.map_spacy_doc()
        _nouns = self.common_nouns(index)
        # Let's look at the sentences
        sents = []
        # Ensure self._spacy_doc is initialized
        if self._spacy_doc is None:
            self._spacy_doc = self.make_spacy_doc()
        # the "sents" property returns spans
        # spans have indices into the original string
        # where each index value represents a token
        for span in self._spacy_doc.sents:
            # go from the start to the end of each span, returning each token in the sentence
            # combine each token using join()
            sent = " ".join(
                self._spacy_doc[i].text for i in range(span.start, span.end)
            ).strip()
            for noun, freq in _nouns:
                if noun in sent:
                    sents.append(sent)
        return sents

    def spans_with_common_nouns(self, word):
        self.map_spacy_doc()
        # Let's look at the sentences
        spans = []
        # the "sents" property returns spans
        # spans have indices into the original string
        # where each index value represents a token
        if self._spacy_doc is None:
            self._spacy_doc = self.make_spacy_doc()
        for span in self._spacy_doc.sents:
            # go from the start to the end of each span, returning each token in the sentence
            # combine each token using join()
            for token in span.text.split():
                if word in self._word.get(token, " "):
                    spans.append(span)
        return spans

    def dimensions(self, word, index=3):
        self.map_spacy_doc()
        _spans = self.spans_with_common_nouns(word)
        _ad = {}
        for span in _spans:
            for token in span.text.split():
                if self._pos.get(token, None) == "ADJ":
                    _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1
                if self._pos.get(token, None) == "ADV":
                    _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1
                if self._pos.get(token, None) == "VERB":
                    _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1
        return sorted(_ad.items(), key=operator.itemgetter(1), reverse=True)[:index]

    def attributes(self, word, index=3):
        self.map_spacy_doc()
        _spans = self.spans_with_common_nouns(word)
        _ad = {}
        for span in _spans:
            for token in span.text.split():
                if self._pos.get(token, None) == "NOUN" and word not in self._word.get(
                    token, ""
                ):
                    _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1
                    # if self._pos.get(token, None) == 'VERB':
                    # _ad[self._word.get(token)] = _ad.get(self._word.get(token), 0) + 1
        return sorted(_ad.items(), key=operator.itemgetter(1), reverse=True)[:index]

    # filter documents in the corpus based on metadata
    def filter_documents(self, metadata_key, metadata_value, mcp=False, id_column="id"):
        """
        Filter documents in the corpus based on metadata.
        If id_column exists in self._corpus.df, filter the DataFrame to match filtered documents' ids.
        """
        # * filter does not require spacy mapping
        # self.map_spacy_doc()
        if self._corpus is None:
            raise ValueError("Corpus is not set")
        filtered_documents = []
        for document in tqdm(
            self._corpus.documents,
            desc="Filtering documents",
            disable=len(self._corpus.documents) < 10,
        ):
            meta_val = document.metadata.get(metadata_key)
            # Check meta_val is not None and is iterable (str, list, tuple, set)
            if meta_val is not None and isinstance(meta_val, (str, list, tuple, set)):
                if metadata_value in meta_val:
                    filtered_documents.append(document)
            # Check document.id and document.text are not None and are str
            if isinstance(document.id, str) and metadata_value in document.id:
                filtered_documents.append(document)
            if isinstance(document.name, str) and metadata_value in document.name:
                filtered_documents.append(document)
        self._corpus.documents = filtered_documents

        # Check for id_column in self._corpus.df and filter df if present
        if (
            hasattr(self._corpus, "df")
            and self._corpus.df is not None
            and id_column in self._corpus.df.columns
        ):
            logger.info(f"id_column '{id_column}' exists in DataFrame.")
            filtered_ids = [doc.id for doc in filtered_documents]
            # Convert id_column to string before comparison
            self._corpus.df = self._corpus.df[
                self._corpus.df[id_column]
                .astype(str)
                .isin([str(i) for i in filtered_ids])
            ]
        else:
            logger.warning(f"id_column '{id_column}' does not exist in DataFrame.")

        if mcp:
            return f"Filtered {len(filtered_documents)} documents with {metadata_key} containing {metadata_value}"
        return filtered_documents

    # get the count of documents in the corpus
    def document_count(self):
        """
        Get the count of documents in the corpus.
        """
        if self._corpus is None:
            raise ValueError("Corpus is not set")
        return len(self._corpus.documents)

    def generate_summary(self, weight=10):
        """[summary]

        Args:
            weight (int, optional): Number of top common words whose sentences are included in the summary. Defaults to 10.

        Returns:
            list: A list of summary lines
        """
        self.map_spacy_doc()
        words = self.common_words()
        spans = []
        ct = 0
        for key, value in words:
            ct += 1
            if ct > weight:
                continue
            for span in self.spans_with_common_nouns(key):
                spans.append(span.text)
        if self._corpus is not None:
            self._corpus.metadata["summary"] = list(
                dict.fromkeys(spans)
            )  # remove duplicates
        return list(dict.fromkeys(spans))  # remove duplicates

    def print_categories(self, spacy_doc=None, num=10):
        self.map_spacy_doc()
        bot = self._spacy_doc._.to_bag_of_terms( # type: ignore
            by="lemma_",
            weighting="freq",
            ngs=(1, 2, 3),
            ents=True,
            ncs=True,
            dedupe=True,
        )
        categories = sorted(bot.items(), key=lambda x: x[1], reverse=True)[:num]
        output = []
        to_return = []
        print("\n---Categories with count---")
        output.append(("CATEGORY", "WEIGHT"))
        for category, count in categories:
            output.append((category, str(count)))
            to_return.append(category)
        QRUtils.print_table(output)
        print("---------------------------\n")
        if self._corpus is not None:
            self._corpus.metadata["categories"] = output
        return to_return

    def category_basket(self, num=10):
        item_basket = []
        spacy_docs, ids = self.make_each_document_into_spacy_doc()
        for spacy_doc in spacy_docs:
            item_basket.append(self.print_categories(spacy_doc, num))
        documents_copy = []
        documents = self._corpus.documents if self._corpus is not None else []
        # add categories to respective documents
        for i, document in enumerate(documents):
            if i < len(item_basket):
                document.metadata["categories"] = item_basket[i]
                documents_copy.append(document)
        # update the corpus with the new documents
        if self._corpus is not None:
            self._corpus.documents = documents_copy
        return item_basket
        # Example return:
        # [['GT', 'Strauss', 'coding', 'ground', 'theory', 'seminal', 'Corbin', 'code',
        # 'structure', 'ground theory'], ['category', 'theory', 'comparison', 'incident',
        # 'GT', 'structure', 'coding', 'Classical', 'Grounded', 'Theory'],
        # ['theory', 'GT', 'evaluation'], ['open', 'coding', 'category', 'QRMine',
        # 'open coding', 'researcher', 'step', 'data', 'break', 'analytically'],
        # ['ground', 'theory', 'GT', 'ground theory'], ['category', 'comparison', 'incident',
        # 'category comparison', 'Theory', 'theory']]

    def category_association(self, num=10):
        """Generates the support for itemsets

        Args:
            num (int, optional): Number of categories to generate for each doc in the corpus. Defaults to 10.
        """
        self.map_spacy_doc()
        basket = self.category_basket(num)
        te = TransactionEncoder()
        te_ary = te.fit(basket).transform(basket)
        df = pd.DataFrame(te_ary, columns=te.columns_)  # type: ignore
        _apriori = apriori(df, min_support=0.6, use_colnames=True)
        # Example
        #    support      itemsets
        # 0  0.666667          (GT)
        # 1  0.833333      (theory)
        # 2  0.666667  (theory, GT)
        documents_copy = []
        documents = self._corpus.documents if self._corpus is not None else []
        # TODO (Change) Add association rules to each document
        for i, document in enumerate(documents):
            if i < len(basket):
                # ! fix document.metadata["association_rules"] = _apriori #TODO This is a corpus metadata, not a document one
                documents_copy.append(document)
        # Add to corpus metadata
        if self._corpus is not None:
            self._corpus.metadata["association_rules"] = _apriori
        # Update the corpus with the new documents
        if self._corpus is not None:
            self._corpus.documents = documents_copy
        return _apriori

corpus property writable

Get the corpus.

initial_document_count property

Get the initial document count.

lang property writable

Get the language of the corpus.

max_length property writable

Get the maximum length of the corpus.

category_association(num=10)

Generates the support for itemsets

Parameters:

    Name    Type    Description                                                     Default
    num     int     Number of categories to generate for each doc in the corpus.   10
Source code in src/crisp_t/text.py, lines 597-627
def category_association(self, num=10):
    """Generates the support for itemsets

    Args:
        num (int, optional): Number of categories to generate for each doc in the corpus. Defaults to 10.
    """
    self.map_spacy_doc()
    basket = self.category_basket(num)
    te = TransactionEncoder()
    te_ary = te.fit(basket).transform(basket)
    df = pd.DataFrame(te_ary, columns=te.columns_)  # type: ignore
    _apriori = apriori(df, min_support=0.6, use_colnames=True)
    # Example
    #    support      itemsets
    # 0  0.666667          (GT)
    # 1  0.833333      (theory)
    # 2  0.666667  (theory, GT)
    documents_copy = []
    documents = self._corpus.documents if self._corpus is not None else []
    # TODO (Change) Add association rules to each document
    for i, document in enumerate(documents):
        if i < len(basket):
            # ! fix document.metadata["association_rules"] = _apriori #TODO This is a corpus metadata, not a document one
            documents_copy.append(document)
    # Add to corpus metadata
    if self._corpus is not None:
        self._corpus.metadata["association_rules"] = _apriori
    # Update the corpus with the new documents
    if self._corpus is not None:
        self._corpus.documents = documents_copy
    return _apriori
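
To make the support output concrete, here is a standalone mlxtend sketch with a made-up category basket; category_association builds the real basket from the corpus via category_basket.

import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.preprocessing import TransactionEncoder

basket = [
    ["theory", "GT", "coding"],
    ["theory", "GT"],
    ["theory", "evaluation"],
]

te = TransactionEncoder()
te_ary = te.fit(basket).transform(basket)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Itemsets appearing in at least 60% of documents, e.g. (theory) and (GT, theory).
print(apriori(df, min_support=0.6, use_colnames=True))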

document_count()

Get the count of documents in the corpus.

Source code in src/crisp_t/text.py, lines 515-521
def document_count(self):
    """
    Get the count of documents in the corpus.
    """
    if self._corpus is None:
        raise ValueError("Corpus is not set")
    return len(self._corpus.documents)

filter_documents(metadata_key, metadata_value, mcp=False, id_column='id')

Filter documents in the corpus based on metadata. If id_column exists in self._corpus.df, filter the DataFrame to match filtered documents' ids.

Source code in src/crisp_t/text.py, lines 466-512
def filter_documents(self, metadata_key, metadata_value, mcp=False, id_column="id"):
    """
    Filter documents in the corpus based on metadata.
    If id_column exists in self._corpus.df, filter the DataFrame to match filtered documents' ids.
    """
    # * filter does not require spacy mapping
    # self.map_spacy_doc()
    if self._corpus is None:
        raise ValueError("Corpus is not set")
    filtered_documents = []
    for document in tqdm(
        self._corpus.documents,
        desc="Filtering documents",
        disable=len(self._corpus.documents) < 10,
    ):
        meta_val = document.metadata.get(metadata_key)
        # Check meta_val is not None and is iterable (str, list, tuple, set)
        if meta_val is not None and isinstance(meta_val, (str, list, tuple, set)):
            if metadata_value in meta_val:
                filtered_documents.append(document)
        # Check document.id and document.text are not None and are str
        if isinstance(document.id, str) and metadata_value in document.id:
            filtered_documents.append(document)
        if isinstance(document.name, str) and metadata_value in document.name:
            filtered_documents.append(document)
    self._corpus.documents = filtered_documents

    # Check for id_column in self._corpus.df and filter df if present
    if (
        hasattr(self._corpus, "df")
        and self._corpus.df is not None
        and id_column in self._corpus.df.columns
    ):
        logger.info(f"id_column '{id_column}' exists in DataFrame.")
        filtered_ids = [doc.id for doc in filtered_documents]
        # Convert id_column to string before comparison
        self._corpus.df = self._corpus.df[
            self._corpus.df[id_column]
            .astype(str)
            .isin([str(i) for i in filtered_ids])
        ]
    else:
        logger.warning(f"id_column '{id_column}' does not exist in DataFrame.")

    if mcp:
        return f"Filtered {len(filtered_documents)} documents with {metadata_key} containing {metadata_value}"
    return filtered_documents
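
The DataFrame side of the filter is plain pandas; a minimal sketch of that step with made-up ids (the document filtering itself needs a loaded Corpus):

import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3], "score": [0.2, 0.9, 0.4]})
filtered_ids = ["2", "3"]  # ids of the documents that survived the metadata filter

# Compare as strings, as filter_documents does, so int and str ids still match.
df = df[df["id"].astype(str).isin([str(i) for i in filtered_ids])]
print(df)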

generate_summary(weight=10)

Generate a summary from sentences that contain the most common words.

Parameters:

    Name      Type    Description                                                                   Default
    weight    int     Number of top common words whose sentences are included in the summary.      10

Returns:

    Type    Description
    list    A list of summary lines

Source code in src/crisp_t/text.py, lines 523-546
def generate_summary(self, weight=10):
    """[summary]

    Args:
        weight (int, optional): Number of top common words whose sentences are included in the summary. Defaults to 10.

    Returns:
        list: A list of summary lines
    """
    self.map_spacy_doc()
    words = self.common_words()
    spans = []
    ct = 0
    for key, value in words:
        ct += 1
        if ct > weight:
            continue
        for span in self.spans_with_common_nouns(key):
            spans.append(span.text)
    if self._corpus is not None:
        self._corpus.metadata["summary"] = list(
            dict.fromkeys(spans)
        )  # remove duplicates
    return list(dict.fromkeys(spans))  # remove duplicates
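
The duplicate removal relies on dict.fromkeys preserving insertion order; a tiny sketch:

spans = ["a first sentence", "another sentence", "a first sentence"]
print(list(dict.fromkeys(spans)))  # ['a first sentence', 'another sentence']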

print_coding_dictionary(num=10, top_n=5)

Prints a coding dictionary based on common verbs, attributes, and dimensions.

    CATEGORY  - the common verb
    PROPERTY  - the common nouns associated with the verb
    DIMENSION - the common adjectives, adverbs, and verbs associated with the property

Parameters:

    Name     Type    Description                                                            Default
    num      int     Number of common verbs to consider.                                    10
    top_n    int     Number of top attributes and dimensions to consider for each verb.     5

Source code in src/crisp_t/text.py, lines 366-396
def print_coding_dictionary(self, num=10, top_n=5):
    """Prints a coding dictionary based on common verbs, attributes, and dimensions.
    "CATEGORY" is the common verb
    "PROPERTY" is the common nouns associated with the verb
    "DIMENSION" is the common adjectives/adverbs/verbs associated with the property
    Args:
        num (int, optional): Number of common verbs to consider. Defaults to 10.
        top_n (int, optional): Number of top attributes and dimensions to consider for each verb. Defaults to 5.

    """
    self.map_spacy_doc()
    output = []
    coding_dict = []
    output.append(("CATEGORY", "PROPERTY", "DIMENSION"))
    verbs = self.common_verbs(num)
    _verbs = []
    for verb, freq in verbs:
        _verbs.append(verb)
    for verb, freq in verbs:
        for attribute, f2 in self.attributes(verb, top_n):
            for dimension, f3 in self.dimensions(attribute, top_n):
                if dimension not in _verbs:
                    output.append((verb, attribute, dimension))
                    coding_dict.append(f"{verb} > {attribute} > {dimension}")
    # Add coding_dict to corpus metadata
    if self._corpus is not None:
        self._corpus.metadata["coding_dict"] = coding_dict
    print("\n---Coding Dictionary---")
    QRUtils.print_table(output)
    print("---------------------------\n")
    return output

process_text(text)

Process the text by removing unwanted characters and normalizing it.

Source code in src/crisp_t/text.py, lines 224-238
def process_text(self, text: str) -> str:
    """
    Process the text by removing unwanted characters and normalizing it.
    """
    # Remove unwanted characters
    text = preprocessing.replace.urls(text)
    text = preprocessing.replace.emails(text)
    text = preprocessing.replace.phone_numbers(text)
    text = preprocessing.replace.currency_symbols(text)
    text = preprocessing.replace.hashtags(text)
    text = preprocessing.replace.numbers(text)

    # lowercase the text
    text = text.lower()
    return text
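
The replacements come from textacy's preprocessing module; a standalone sketch on a made-up string, assuming textacy is installed (the placeholder tokens such as _URL_ are textacy's defaults):

from textacy import preprocessing

text = "Email me at jane@example.com or visit https://example.org for details."
text = preprocessing.replace.urls(text)
text = preprocessing.replace.emails(text)
text = preprocessing.replace.numbers(text)
print(text.lower())  # urls/emails/numbers replaced by textacy's placeholder tokens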

process_tokens(id='corpus')

Process tokens in the spacy document and extract relevant information.

Source code in src/crisp_t/text.py, lines 241-328
def process_tokens(self, id="corpus"):
    """
    Process tokens in the spacy document and extract relevant information.
    """

    # ! if cached file exists, load it
    cache_dir = Path("cache")
    cache_file = cache_dir / f"spacy_doc_{id}.pkl"
    if cache_file.exists():
        with open(cache_file, "rb") as f:
            spacy_doc, results = pickle.load(f)
        logger.info("Loaded cached spacy doc and results.")
        return spacy_doc, results

    spacy_doc = self.make_spacy_doc()
    logger.info("Spacy doc created.")

    n_cores = multiprocessing.cpu_count()

    def process_token(token):
        if token.is_stop or token.is_digit or token.is_punct or token.is_space:
            return None
        if token.like_url or token.like_num or token.like_email:
            return None
        if len(token.text) < 3 or token.text.isupper():
            return None
        return {
            "text": token.text,
            "lemma": token.lemma_,
            "pos": token.pos_,
            "pos_": token.pos,
            "word": token.lemma_,
            "sentiment": token.sentiment,
            "tag": token.tag_,
            "dep": token.dep_,
            "prob": token.prob,
            "idx": token.idx,
        }

    tokens = list(spacy_doc)
    _lemma = {}
    _pos = {}
    _pos_ = {}
    _word = {}
    _sentiment = {}
    _tag = {}
    _dep = {}
    _prob = {}
    _idx = {}
    with ThreadPoolExecutor() as executor:
        futures = {executor.submit(process_token, token): token for token in tokens}
        with tqdm(
            total=len(futures),
            desc=f"Processing tokens (parallel, {n_cores} cores)",
        ) as pbar:
            for future in as_completed(futures):
                result = future.result()
                if result is not None:
                    _lemma[result["text"]] = result["lemma"]
                    _pos[result["text"]] = result["pos"]
                    _pos_[result["text"]] = result["pos_"]
                    _word[result["text"]] = result["word"]
                    _sentiment[result["text"]] = result["sentiment"]
                    _tag = result["tag"]
                    _dep = result["dep"]
                    _prob = result["prob"]
                    _idx = result["idx"]
                pbar.update(1)
    logger.info("Token processing complete.")
    results = {
        "lemma": _lemma,
        "pos": _pos,
        "pos_": _pos_,
        "word": _word,
        "sentiment": _sentiment,
        "tag": _tag,
        "dep": _dep,
        "prob": _prob,
        "idx": _idx,
    }
    # ! dump spacy_doc, results to a file for caching with the corpus id
    cache_dir = Path("cache")
    cache_dir.mkdir(exist_ok=True)
    cache_file = cache_dir / f"spacy_doc_{id}.pkl"
    with open(cache_file, "wb") as f:
        pickle.dump((spacy_doc, results), f)

    return spacy_doc, results
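
The cache written above is a plain pickle under cache/ keyed by the corpus id (the documented default id is "corpus"); a hedged sketch of reading it back outside the class:

import pickle
from pathlib import Path

cache_file = Path("cache") / "spacy_doc_corpus.pkl"  # path convention from process_tokens
if cache_file.exists():
    with open(cache_file, "rb") as f:
        spacy_doc, results = pickle.load(f)
    # results holds per-token lookups, e.g. results["lemma"] and results["pos"].
    print(len(results["lemma"]), "lemmas cached")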

ML

Source code in src/crisp_t/ml.py, lines 119-1078
class ML:
    def __init__(
        self,
        csv: Csv,
    ):
        if not ML_INSTALLED:
            raise ImportError("ML dependencies are not installed.")
        self._csv = csv
        self._epochs = 3
        self._samplesize = 0

    @property
    def csv(self):
        return self._csv

    @property
    def corpus(self):
        return self._csv.corpus

    @csv.setter
    def csv(self, value):
        if isinstance(value, Csv):
            self._csv = value
        else:
            raise ValueError(f"The input belongs to {type(value)} instead of Csv.")

    def get_kmeans(self, number_of_clusters=3, seed=42, verbose=True, mcp=False):
        if self._csv is None:
            raise ValueError(
                "CSV data is not set. Please set self.csv before calling get_kmeans."
            )
        X, _ = self._csv.read_xy("")  # No output variable for clustering
        if X is None:
            raise ValueError(
                "Input features X are None. Cannot perform KMeans clustering."
            )
        kmeans = KMeans(
            n_clusters=number_of_clusters, init="k-means++", random_state=seed
        )
        self._clusters = kmeans.fit_predict(X)
        members = self._get_members(self._clusters, number_of_clusters)
        # Add cluster info to csv to metadata_cluster column
        if self._csv is not None and getattr(self._csv, "df", None) is not None:
            self._csv.df["metadata_cluster"] = self._clusters
        if verbose:
            print("KMeans Cluster Centers:\n", kmeans.cluster_centers_)
            print(
                "KMeans Inertia (Sum of squared distances to closest cluster center):\n",
                kmeans.inertia_,
            )
            if self._csv.corpus is not None:
                self._csv.corpus.metadata["kmeans"] = (
                    f"KMeans clustering with {number_of_clusters} clusters. Inertia: {kmeans.inertia_}"
                )
        # Add members info to corpus metadata
        members_info = "\n".join(
            [
                f"Cluster {i}: {len(members[i])} members"
                for i in range(number_of_clusters)
            ]
        )
        if self._csv.corpus is not None:
            self._csv.corpus.metadata["kmeans_members"] = (
                f"KMeans clustering members:\n{members_info}"
            )
        if mcp:
            return members_info
        return self._clusters, members

    def _get_members(self, clusters, number_of_clusters=3):
        _df = self._csv.df
        self._csv.df = _df
        members = []
        for i in range(number_of_clusters):
            members.append([])
        for i, cluster in enumerate(clusters):
            members[cluster].append(i)
        return members

    def profile(self, members, number_of_clusters=3):
        if self._csv is None:
            raise ValueError(
                "CSV data is not set. Please set self.csv before calling profile."
            )
        _corpus = self._csv.corpus
        _numeric_clusters = ""
        for i in range(number_of_clusters):
            print("Cluster: ", i)
            print("Cluster Length: ", len(members[i]))
            print("Cluster Members")
            if self._csv is not None and getattr(self._csv, "df", None) is not None:
                print(self._csv.df.iloc[members[i], :])
                print("Centroids")
                print(self._csv.df.iloc[members[i], :].mean(axis=0))
                _numeric_clusters += f"Cluster {i} with {len(members[i])} members\n has the following centroids (mean values):\n"
                _numeric_clusters += (
                    f"{self._csv.df.iloc[members[i], :].mean(axis=0)}\n"
                )
            else:
                print("DataFrame (self._csv.df) is not set.")
        if _corpus is not None:
            _corpus.metadata["numeric_clusters"] = _numeric_clusters
            self._csv.corpus = _corpus
        return members

    def get_nnet_predictions(self, y: str, mcp=False):
        """
        Extended: Handles binary (BCELoss) and multi-class (CrossEntropyLoss).
        Returns list of predicted original class labels.
        """
        if ML_INSTALLED is False:
            logger.info(
                "ML dependencies are not installed. Please install them by ```pip install crisp-t[ml] to use ML features."
            )
            return None

        if self._csv is None:
            raise ValueError(
                "CSV data is not set. Please set self.csv before calling profile."
            )
        _corpus = self._csv.corpus

        X_np, Y_raw, X, Y = self._process_xy(y=y)

        unique_classes = np.unique(Y_raw)
        num_classes = unique_classes.size
        if num_classes < 2:
            raise ValueError(f"Need at least 2 classes; found {num_classes}.")

        vnum = X_np.shape[1]

        # Binary path
        if num_classes == 2:
            # Map to {0.0,1.0} for BCELoss if needed
            mapping_applied = False
            class_mapping = {}
            inverse_mapping = {}
            # Ensure deterministic order
            sorted_classes = sorted(unique_classes.tolist())
            if not (sorted_classes == [0, 1] or sorted_classes == [0.0, 1.0]):
                class_mapping = {sorted_classes[0]: 0.0, sorted_classes[1]: 1.0}
                inverse_mapping = {v: k for k, v in class_mapping.items()}
                Y_mapped = np.vectorize(class_mapping.get)(Y_raw).astype(np.float32)
                mapping_applied = True
            else:
                Y_mapped = Y_raw.astype(np.float32)

            model = NeuralNet(vnum)
            try:
                criterion = nn.BCELoss()  # type: ignore
                optimizer = optim.Adam(model.parameters(), lr=0.001)  # type: ignore

                X_tensor = torch.from_numpy(X_np)  # type: ignore
                y_tensor = torch.from_numpy(Y_mapped.astype(np.float32)).view(-1, 1)  # type: ignore

                dataset = TensorDataset(X_tensor, y_tensor)  # type: ignore
                dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  # type: ignore
            except Exception as e:
                logger.error(f"Error occurred while creating DataLoader: {e}")
                return None

            for _ in range(self._epochs):
                for batch_X, batch_y in dataloader:
                    optimizer.zero_grad()
                    outputs = model(batch_X)
                    loss = criterion(outputs, batch_y)
                    if torch.isnan(loss):  # type: ignore
                        raise RuntimeError("NaN loss encountered.")
                    loss.backward()
                    optimizer.step()

            # Predictions
            bin_preds_internal = None
            if torch:
                with torch.no_grad():
                    probs = model(torch.from_numpy(X_np)).view(-1).cpu().numpy()
                bin_preds_internal = (probs >= 0.5).astype(int)

            if mapping_applied:
                preds = [inverse_mapping[float(p)] for p in bin_preds_internal]  # type: ignore
                y_eval = np.vectorize(class_mapping.get)(Y_raw).astype(int)
                preds_eval = bin_preds_internal
            else:
                preds = bin_preds_internal.tolist()  # type: ignore
                y_eval = Y_mapped.astype(int)
                preds_eval = bin_preds_internal

            accuracy = (preds_eval == y_eval).sum() / len(y_eval)
            print(
                f"\nPredicting {y} with {X.shape[1]} features for {self._epochs} epochs gave an accuracy (convergence): {accuracy*100:.2f}%\n"
            )
            if _corpus is not None:
                _corpus.metadata["nnet_predictions"] = (
                    f"Predicting {y} with {X.shape[1]} features for {self._epochs} epochs gave an accuracy (convergence): {accuracy*100:.2f}%"
                )
            if mcp:
                return f"Predicting {y} with {X.shape[1]} features for {self._epochs} epochs gave an accuracy (convergence): {accuracy*100:.2f}%"
            return preds

        # Multi-class path
        # Map original classes to indices
        sorted_classes = sorted(unique_classes.tolist())
        class_to_idx = {c: i for i, c in enumerate(sorted_classes)}
        idx_to_class = {i: c for c, i in class_to_idx.items()}
        Y_idx = np.vectorize(class_to_idx.get)(Y_raw).astype(np.int64)

        model = MultiClassNet(vnum, num_classes)
        criterion = nn.CrossEntropyLoss()  # type: ignore
        optimizer = optim.Adam(model.parameters(), lr=0.001)  # type: ignore

        X_tensor = torch.from_numpy(X_np)  # type: ignore
        y_tensor = torch.from_numpy(Y_idx)  # type: ignore

        dataset = TensorDataset(X_tensor, y_tensor)  # type: ignore
        dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  # type: ignore

        for _ in range(self._epochs):
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()
                logits = model(batch_X)
                loss = criterion(logits, batch_y)
                if torch.isnan(loss):  # type: ignore
                    raise RuntimeError("NaN loss encountered.")
                loss.backward()
                optimizer.step()

        with torch.no_grad():  # type: ignore
            logits_full = model(torch.from_numpy(X_np))  # type: ignore
            pred_indices = torch.argmax(logits_full, dim=1).cpu().numpy()  # type: ignore

        preds = [idx_to_class[i] for i in pred_indices]
        accuracy = (pred_indices == Y_idx).sum() / len(Y_idx)
        print(
            f"\nPredicting {y} with {X.shape[1]} features for {self._epochs} gave an accuracy (convergence): {accuracy*100:.2f}%\n"
        )
        if _corpus is not None:
            _corpus.metadata["nnet_predictions"] = (
                f"Predicting {y} with {X.shape[1]} features for {self._epochs} gave an accuracy (convergence): {accuracy*100:.2f}%"
            )
        if mcp:
            return f"Predicting {y} with {X.shape[1]} features for {self._epochs} gave an accuracy (convergence): {accuracy*100:.2f}%"
        return preds

    def _convert_to_binary(self, Y):
        unique_values = np.unique(Y)
        if len(unique_values) != 2:
            logger.warning(
                "Target variable has more than two unique values."
            )
            # convert unique_values[0] to 0, rest to 1
            mapping = {val: (0 if val == unique_values[0] else 1) for val in unique_values}
        else:
            mapping = {unique_values[0]: 0, unique_values[1]: 1}
        Y_binary = np.vectorize(mapping.get)(Y)
        print(f"Converted target variable to binary using mapping: {mapping}")
        return Y_binary

    def svm_confusion_matrix(self, y: str, test_size=0.25, random_state=0, mcp=False):
        """Generate confusion matrix for SVM

        Returns:
            np.ndarray: The confusion matrix for the test split.
        """
        X_np, Y_raw, X, Y = self._process_xy(y=y)
        Y = self._convert_to_binary(Y)
        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, test_size=test_size, random_state=random_state
        )
        sc = StandardScaler()
        # Issue #22
        y_test = y_test.astype("int")
        y_train = y_train.astype("int")
        X_train = sc.fit_transform(X_train)
        X_test = sc.transform(X_test)
        classifier = SVC(kernel="linear", random_state=0)
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        # Issue #22
        y_pred = y_pred.astype("int")
        _confusion_matrix = confusion_matrix(y_test, y_pred)
        print(f"Confusion Matrix for SVM predicting {y}:\n{_confusion_matrix}")
        # Output
        # [[2 0]
        #  [2 0]]
        if self._csv.corpus is not None:
            self._csv.corpus.metadata["svm_confusion_matrix"] = (
                f"Confusion Matrix for SVM predicting {y}:\n{self.format_confusion_matrix_to_human_readable(_confusion_matrix)}"
            )

        if mcp:
            return f"Confusion Matrix for SVM predicting {y}:\n{self.format_confusion_matrix_to_human_readable(_confusion_matrix)}"

        return _confusion_matrix

    def format_confusion_matrix_to_human_readable(
        self, confusion_matrix: np.ndarray
    ) -> str:
        """Format the confusion matrix to a human-readable string.

        Args:
            confusion_matrix (np.ndarray): The confusion matrix to format.

        Returns:
            str: The formatted confusion matrix with true positive, false positive, true negative, and false negative counts.
        """
        tn, fp, fn, tp = confusion_matrix.ravel()
        return (
            f"True Positive: {tp}\n"
            f"False Positive: {fp}\n"
            f"True Negative: {tn}\n"
            f"False Negative: {fn}\n"
        )

    # https://stackoverflow.com/questions/45419203/python-numpy-extracting-a-row-from-an-array
    def knn_search(self, y: str, n=3, r=3, mcp=False):
        X_np, Y_raw, X, Y = self._process_xy(y=y)
        kdt = KDTree(X_np, leaf_size=2, metric="euclidean")
        dist, ind = kdt.query(X_np[r - 1 : r, :], k=n)
        # Display results as human readable (1-based)
        ind = (ind + 1).tolist()  # Convert to 1-based index
        dist = dist.tolist()
        print(
            f"\nKNN search for {y} (n={n}, record no: {r}): {ind} with distances {dist}\n"
        )
        if self._csv.corpus is not None:
            self._csv.corpus.metadata["knn_search"] = (
                f"KNN search for {y} (n={n}, record no: {r}): {ind} with distances {dist}"
            )
        if mcp:
            return f"KNN search for {y} (n={n}, record no: {r}): {ind} with distances {dist}"
        return dist, ind

    def _process_xy(self, y: str, oversample=False, one_hot_encode_all=False):
        X, Y = self._csv.prepare_data(
            y=y, oversample=oversample, one_hot_encode_all=one_hot_encode_all
        )
        if X is None or Y is None:
            raise ValueError("prepare_data returned None for X or Y.")

        # To numpy float32
        X_np = (
            X.to_numpy(dtype=np.float32)
            if hasattr(X, "to_numpy")
            else np.asarray(X, dtype=np.float32)
        )
        Y_raw = Y.to_numpy() if hasattr(Y, "to_numpy") else np.asarray(Y)

        # Handle NaNs
        if np.isnan(X_np).any():
            raise ValueError("NaN detected in feature matrix.")
        if np.isnan(Y_raw.astype(float, copy=False)).any():
            raise ValueError("NaN detected in target vector.")

        return X_np, Y_raw, X, Y

    def get_decision_tree_classes(
        self, y: str, top_n=5, test_size=0.5, random_state=1, mcp=False
    ):
        X_np, Y_raw, X, Y = self._process_xy(y=y)
        Y_raw = self._convert_to_binary(Y_raw)
        X_train, X_test, y_train, y_test = train_test_split(
            X_np, Y_raw, test_size=test_size, random_state=random_state
        )

        # print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        # print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

        # Train a RandomForestClassifier
        clf = RandomForestClassifier(n_estimators=100, random_state=42)
        clf.fit(X_train, y_train)

        # Compute permutation importance
        results = permutation_importance(
            clf, X_test, y_test, n_repeats=10, random_state=42
        )

        # classifier = DecisionTreeClassifier(random_state=random_state) # type: ignore
        # classifier.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        _confusion_matrix = confusion_matrix(y_test, y_pred)
        print(
            f"Confusion Matrix for Decision Tree predicting {y}:\n{_confusion_matrix}"
        )
        # Output
        # [[2 0]
        #  [2 0]]

        accuracy = accuracy_score(y_test, y_pred)
        print(f"\nAccuracy: {accuracy}\n")

        # Retrieve feature importance scores
        importance = results.importances_mean

        # Get indices of top N important features
        top_n_indices = np.argsort(importance)[-top_n:][::-1]

        # Display feature importance
        print(f"==== Top {top_n} important features ====\n")
        _importance = ""
        for i, v in enumerate(top_n_indices):
            print(f"Feature: {X.columns[v]}, Score: {importance[v]:.5f}")
            _importance += f"Feature: {X.columns[v]}, Score: {importance[v]:.5f}\n"

        if self._csv.corpus is not None:
            self._csv.corpus.metadata["decision_tree_accuracy"] = (
                f"Decision Tree accuracy for predicting {y}: {accuracy*100:.2f}%"
            )
            self._csv.corpus.metadata["decision_tree_confusion_matrix"] = (
                f"Confusion Matrix for Decision Tree predicting {y}:\n{self.format_confusion_matrix_to_human_readable(_confusion_matrix)}"
            )
            self._csv.corpus.metadata["decision_tree_feature_importance"] = _importance
        if mcp:
            return f"""
            Confusion Matrix for Decision Tree predicting {y}:\n{self.format_confusion_matrix_to_human_readable(_confusion_matrix)}\nTop {top_n} important features:\n{_importance}
            Accuracy: {accuracy*100:.2f}%
            """
        return _confusion_matrix, importance

    def get_xgb_classes(
        self, y: str, oversample=False, test_size=0.25, random_state=0, mcp=False
    ):
        try:
            from xgboost import XGBClassifier  # type: ignore
        except ImportError:
            raise ImportError(
                "XGBoost is not installed. Please install it via `pip install crisp-t[xg]`."
            )
        X_np, Y_raw, X, Y = self._process_xy(y=y)
        if ML_INSTALLED:
            # ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [1 2]
            # convert y to binary
            Y_binary = (Y_raw == 1).astype(int)
            X_train, X_test, y_train, y_test = train_test_split(
                X_np, Y_binary, test_size=test_size, random_state=random_state
            )
            classifier = XGBClassifier(use_label_encoder=False, eval_metric="logloss")  # type: ignore
            classifier.fit(X_train, y_train)
            y_pred = classifier.predict(X_test)
            _confusion_matrix = confusion_matrix(y_test, y_pred)
            print(f"Confusion Matrix for XGBoost predicting {y}:\n{_confusion_matrix}")
            # Output
            # [[2 0]
            #  [2 0]]
            if self._csv.corpus is not None:
                self._csv.corpus.metadata["xgb_confusion_matrix"] = (
                    f"Confusion Matrix for XGBoost predicting {y}:\n{_confusion_matrix}"
                )
            if mcp:
                return f"""
                Confusion Matrix for XGBoost predicting {y}:\n{self.format_confusion_matrix_to_human_readable(_confusion_matrix)}
                """
            return _confusion_matrix
        else:
            raise ImportError("ML dependencies are not installed.")

    def get_apriori(
        self, y: str, min_support=0.9, use_colnames=True, min_threshold=0.5, mcp=False
    ):
        if ML_INSTALLED:
            X_np, Y_raw, X, Y = self._process_xy(y=y, one_hot_encode_all=True)
            frequent_itemsets = apriori(X, min_support=min_support, use_colnames=use_colnames)  # type: ignore
            # rules = association_rules(frequent_itemsets, metric="lift", min_threshold=min_threshold) # type: ignore
            human_readable = tabulate(
                frequent_itemsets.head(10), headers="keys", tablefmt="pretty"  # type: ignore
            )
            if self._csv.corpus is not None:
                self._csv.corpus.metadata["apriori_frequent_itemsets"] = human_readable
            if mcp:
                return f"Frequent itemsets (top 10):\n{human_readable}"
            return frequent_itemsets  # , rules
        else:
            raise ImportError("ML dependencies are not installed.")

    def get_pca(self, y: str, n: int = 3, mcp=False):
        """
        Perform a manual PCA (no sklearn PCA) on the feature matrix for target y.

        Args:
            y (str): Target column name (used only for data preparation).
            n (int): Number of principal components to keep.

        Returns:
            dict: {
                'covariance_matrix': cov_mat,
                'eigenvalues': eig_vals_sorted,
                'eigenvectors': eig_vecs_sorted,
                'explained_variance_ratio': var_exp,
                'cumulative_explained_variance_ratio': cum_var_exp,
                'projection_matrix': matrix_w,
                'transformed': X_pca
            }
        """
        X_np, Y_raw, X, Y = self._process_xy(y=y)
        X_std = StandardScaler().fit_transform(X_np)

        cov_mat = np.cov(X_std.T)
        eig_vals, eig_vecs = np.linalg.eigh(cov_mat)  # symmetric matrix -> eigh

        # Sort eigenvalues (and vectors) descending
        idx = np.argsort(eig_vals)[::-1]
        eig_vals_sorted = eig_vals[idx]
        eig_vecs_sorted = eig_vecs[:, idx]

        factors = X_std.shape[1]
        n = max(1, min(n, factors))

        # Explained variance ratios
        tot = eig_vals_sorted.sum()
        var_exp = (eig_vals_sorted / tot) * 100.0
        cum_var_exp = np.cumsum(var_exp)

        # Projection matrix (first n eigenvectors)
        matrix_w = eig_vecs_sorted[:, :n]

        # Project data
        X_pca = X_std @ matrix_w

        # Optional prints (retain original behavior)
        print("Covariance matrix:\n", cov_mat)
        print("Eigenvalues (desc):\n", eig_vals_sorted)
        print("Explained variance (%):\n", var_exp[:n])
        print("Cumulative explained variance (%):\n", cum_var_exp[:n])
        print("Projection matrix (W):\n", matrix_w)
        print("Transformed (first 5 rows):\n", X_pca[:5])

        result = {
            "covariance_matrix": cov_mat,
            "eigenvalues": eig_vals_sorted,
            "eigenvectors": eig_vecs_sorted,
            "explained_variance_ratio": var_exp,
            "cumulative_explained_variance_ratio": cum_var_exp,
            "projection_matrix": matrix_w,
            "transformed": X_pca,
        }

        if self._csv.corpus is not None:
            self._csv.corpus.metadata["pca"] = (
                f"PCA kept {n} components explaining "
                f"{cum_var_exp[n-1]:.2f}% variance."
            )
        if mcp:
            return (
                f"PCA kept {n} components explaining {cum_var_exp[n-1]:.2f}% variance."
            )
        return result

    def get_regression(self, y: str, mcp=False):
        """
        Perform linear or logistic regression based on the outcome variable type.

        If the outcome is binary, fit a logistic regression model.
        Otherwise, fit a linear regression model.

        Args:
            y (str): Target column name for the regression.

        Returns:
            dict: Regression results including coefficients, intercept, and metrics.
        """
        if ML_INSTALLED is False:
            logger.info(
                "ML dependencies are not installed. Please install them by ```pip install crisp-t[ml] to use ML features."
            )
            return None

        if self._csv is None:
            raise ValueError(
                "CSV data is not set. Please set self.csv before calling get_regression."
            )

        X_np, Y_raw, X, Y = self._process_xy(y=y)

        # Check if outcome is binary (logistic) or continuous (linear)
        unique_values = np.unique(Y_raw)
        num_unique = len(unique_values)

        # Determine if binary classification or regression
        is_binary = num_unique == 2

        if is_binary:
            # Logistic Regression
            print(f"\n=== Logistic Regression for {y} ===")
            print(f"Binary outcome detected with values: {unique_values}")

            model = LogisticRegression(max_iter=1000, random_state=42)
            model.fit(X_np, Y_raw)

            # Predictions
            y_pred = model.predict(X_np)

            # Accuracy
            accuracy = accuracy_score(Y_raw, y_pred)
            print(f"\nAccuracy: {accuracy*100:.2f}%")

            # Coefficients and Intercept
            print(f"\nCoefficients:")
            for i, coef in enumerate(model.coef_[0]):
                feature_name = X.columns[i] if hasattr(X, "columns") else f"Feature_{i}"
                print(f"  {feature_name}: {coef:.5f}")

            print(f"\nIntercept: {model.intercept_[0]:.5f}")

            coef_str = "\n".join(
                [
                    f"  {X.columns[i] if hasattr(X, 'columns') else f'Feature_{i}'}: {coef:.5f}"
                    for i, coef in enumerate(model.coef_[0])
                ]
            )

            # Store in metadata
            if self._csv.corpus is not None:
                self._csv.corpus.metadata["logistic_regression_accuracy"] = (
                    f"Logistic Regression accuracy for predicting {y}: {accuracy*100:.2f}%"
                )
                self._csv.corpus.metadata["logistic_regression_coefficients"] = (
                    f"Coefficients:\n{coef_str}"
                )
                self._csv.corpus.metadata["logistic_regression_intercept"] = (
                    f"Intercept: {model.intercept_[0]:.5f}"
                )

            if mcp:
                return f"""
                Logistic Regression accuracy for predicting {y}: {accuracy*100:.2f}%
                Coefficients:
                {coef_str}
                Intercept: {model.intercept_[0]:.5f}
                """
            return {
                "model_type": "logistic",
                "accuracy": accuracy,
                "coefficients": model.coef_[0],
                "intercept": model.intercept_[0],
                "feature_names": X.columns.tolist() if hasattr(X, "columns") else None,
            }
        else:
            # Linear Regression
            print(f"\n=== Linear Regression for {y} ===")
            print(f"Continuous outcome detected with {num_unique} unique values")

            model = LinearRegression()
            model.fit(X_np, Y_raw)

            # Predictions
            y_pred = model.predict(X_np)

            # Metrics
            mse = mean_squared_error(Y_raw, y_pred)
            r2 = r2_score(Y_raw, y_pred)
            print(f"\nMean Squared Error (MSE): {mse:.5f}")
            print(f"R² Score: {r2:.5f}")

            # Coefficients and Intercept
            print(f"\nCoefficients:")
            for i, coef in enumerate(model.coef_):
                feature_name = X.columns[i] if hasattr(X, "columns") else f"Feature_{i}"
                print(f"  {feature_name}: {coef:.5f}")

            print(f"\nIntercept: {model.intercept_:.5f}")

            coef_str = "\n".join(
                [
                    f"  {X.columns[i] if hasattr(X, 'columns') else f'Feature_{i}'}: {coef:.5f}"
                    for i, coef in enumerate(model.coef_)
                ]
            )

            # Store in metadata
            if self._csv.corpus is not None:
                self._csv.corpus.metadata["linear_regression_mse"] = (
                    f"Linear Regression MSE for predicting {y}: {mse:.5f}"
                )
                self._csv.corpus.metadata["linear_regression_r2"] = (
                    f"Linear Regression R² for predicting {y}: {r2:.5f}"
                )
                self._csv.corpus.metadata["linear_regression_coefficients"] = (
                    f"Coefficients:\n{coef_str}"
                )
                self._csv.corpus.metadata["linear_regression_intercept"] = (
                    f"Intercept: {model.intercept_:.5f}"
                )

            if mcp:
                return f"""
                Linear Regression MSE for predicting {y}: {mse:.5f}
                R²: {r2:.5f}
                Feature Names and Coefficients:
                {coef_str}
                Intercept: {model.intercept_:.5f}
                """
            return {
                "model_type": "linear",
                "mse": mse,
                "r2": r2,
                "coefficients": model.coef_,
                "intercept": model.intercept_,
                "feature_names": X.columns.tolist() if hasattr(X, "columns") else None,
            }

    def get_lstm_predictions(self, y: str, mcp=False):
        """
        Train an LSTM model on text data to predict an outcome variable.
        This tests if the texts converge towards predicting the outcome.

        Args:
            y (str): Name of the outcome variable in the DataFrame
            mcp (bool): If True, return a string format suitable for MCP

        Returns:
            Evaluation metrics as string (if mcp=True) or dict
        """
        if ML_INSTALLED is False:
            logger.error(
                "ML dependencies are not installed. Please install them by ```pip install crisp-t[ml] to use ML features."
            )
            if mcp:
                return "ML dependencies are not installed. Please install with: pip install crisp-t[ml]"
            return None

        if self._csv is None:
            logger.error("CSV data is not set.")
            if mcp:
                return "CSV data is not set. Cannot perform LSTM prediction."
            return None

        _corpus = self._csv.corpus
        if _corpus is None:
            logger.error("Corpus is not available.")
            if mcp:
                return "Corpus is not available. Cannot perform LSTM prediction."
            return None

        # Check if id_column exists
        id_column = "id"
        if not hasattr(self._csv, "df") or self._csv.df is None:
            logger.error("DataFrame is not available in CSV.")
            if mcp:
                return "This tool can be used only if texts and outcome variables align. DataFrame is missing."
            return None

        if id_column not in self._csv.df.columns:
            logger.error(
                f"The id_column '{id_column}' does not exist in the DataFrame."
            )
            if mcp:
                return f"This tool can be used only if texts and outcome variables align. The '{id_column}' column is missing from the DataFrame."
            return None

        # Check if outcome variable exists
        if y not in self._csv.df.columns:
            logger.error(f"The outcome variable '{y}' does not exist in the DataFrame.")
            if mcp:
                return f"The outcome variable '{y}' does not exist in the DataFrame."
            return None

        # Process documents and align with outcome variable
        try:
            # Build vocabulary from all documents
            from collections import Counter

            word_counts = Counter()
            tokenized_docs = []

            for doc in tqdm(_corpus.documents, desc="Tokenizing documents", disable=len(_corpus.documents) < 10):
                # Simple tokenization - split on whitespace and lowercase
                tokens = doc.text.lower().split()
                tokenized_docs.append(tokens)
                word_counts.update(tokens)

            # Create vocabulary with most common words (limit to 10000)
            vocab_size = min(10000, len(word_counts)) + 1  # +1 for padding
            most_common = word_counts.most_common(vocab_size - 1)
            word_to_idx = {
                word: idx + 1 for idx, (word, _) in enumerate(most_common)
            }  # 0 reserved for padding

            # Convert documents to sequences of indices
            max_length = 100  # Maximum sequence length
            sequences = []
            doc_ids = []

            for doc, tokens in tqdm(zip(_corpus.documents, tokenized_docs), total=len(_corpus.documents), desc="Converting to sequences", disable=len(_corpus.documents) < 10):
                # Convert tokens to indices
                seq = [word_to_idx.get(token, 0) for token in tokens]
                # Pad or truncate to max_length
                if len(seq) > max_length:
                    seq = seq[:max_length]
                else:
                    seq = seq + [0] * (max_length - len(seq))
                sequences.append(seq)
                doc_ids.append(doc.id)

            # Align with outcome variable using id column
            df = self._csv.df.set_index(id_column)

            aligned_sequences = []
            aligned_outcomes = []

            df_index_str = list(str(idx) for idx in df.index)
            for doc_id, seq in zip(doc_ids, sequences):
                if doc_id in df_index_str:
                    aligned_sequences.append(seq)
                    # Select y from df where id_column == doc_id, using string comparison
                    matched_row = df.loc[
                        [idx for idx in df.index if str(idx) == str(doc_id)]
                    ]
                    if not matched_row.empty:
                        aligned_outcomes.append(matched_row.iloc[0][y])

            if len(aligned_sequences) == 0:
                logger.error("No documents could be aligned with the outcome variable.")
                if mcp:
                    return "This tool can be used only if texts and outcome variables align. No matching IDs found."
                return None

            # Convert to tensors
            X_tensor = torch.LongTensor(aligned_sequences)  # type: ignore
            y_array = np.array(aligned_outcomes)

            # Handle binary classification
            unique_values = np.unique(y_array)
            num_classes = len(unique_values)

            if num_classes < 2:
                logger.error(
                    f"Need at least 2 classes for classification, found {num_classes}"
                )
                if mcp:
                    return f"Need at least 2 classes for classification, found {num_classes}"
                return None

            # Map to 0/1 for binary classification
            if num_classes == 2:
                class_mapping = {unique_values[0]: 0.0, unique_values[1]: 1.0}
                y_mapped = np.array(
                    [class_mapping[val] for val in y_array], dtype=np.float32
                )
            else:
                # Multi-class not supported in this simple LSTM implementation
                logger.error(
                    "Multi-class classification is not supported for LSTM. Please use binary outcome."
                )
                if mcp:
                    return "Multi-class classification is not supported for LSTM. Please use binary outcome."
                return None

            y_tensor = torch.FloatTensor(y_mapped).view(-1, 1)  # type: ignore

            # Split into train/test
            from sklearn.model_selection import train_test_split

            indices = list(range(len(X_tensor)))
            train_idx, test_idx = train_test_split(
                indices, test_size=0.2, random_state=42
            )

            X_train = X_tensor[train_idx]
            y_train = y_tensor[train_idx]
            X_test = X_tensor[test_idx]
            y_test = y_tensor[test_idx]

            # Create model
            model = SimpleLSTM(vocab_size=vocab_size)  # type: ignore
            criterion = nn.BCELoss()  # type: ignore
            optimizer = optim.Adam(model.parameters(), lr=0.001)  # type: ignore

            # Create data loaders
            train_dataset = TensorDataset(X_train, y_train)  # type: ignore
            train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)  # type: ignore

            # Training
            epochs = max(self._epochs, 3)  # Use at least 3 epochs for LSTM
            model.train()
            for epoch in range(epochs):
                total_loss = 0
                for batch_x, batch_y in train_loader:
                    optimizer.zero_grad()
                    predictions = model(batch_x)
                    loss = criterion(predictions, batch_y)
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()

                avg_loss = total_loss / len(train_loader)
                logger.info(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

            # Evaluation
            model.eval()
            with torch.no_grad():  # type: ignore
                train_preds = model(X_train)
                test_preds = model(X_test)

                train_preds_binary = (train_preds >= 0.5).float()
                test_preds_binary = (test_preds >= 0.5).float()

                train_accuracy = (train_preds_binary == y_train).float().mean().item()
                test_accuracy = (test_preds_binary == y_test).float().mean().item()

            # Calculate additional metrics for test set
            y_test_np = y_test.cpu().numpy().flatten()
            test_preds_np = test_preds_binary.cpu().numpy().flatten()

            # Confusion matrix elements
            tp = ((test_preds_np == 1) & (y_test_np == 1)).sum()
            tn = ((test_preds_np == 0) & (y_test_np == 0)).sum()
            fp = ((test_preds_np == 1) & (y_test_np == 0)).sum()
            fn = ((test_preds_np == 0) & (y_test_np == 1)).sum()

            # Calculate precision, recall, F1
            precision = tp / (tp + fp) if (tp + fp) > 0 else 0
            recall = tp / (tp + fn) if (tp + fn) > 0 else 0
            f1 = (
                2 * (precision * recall) / (precision + recall)
                if (precision + recall) > 0
                else 0
            )

            result_msg = (
                f"LSTM Model Evaluation for predicting '{y}':\n"
                f"  Vocabulary size: {vocab_size}\n"
                f"  Training samples: {len(X_train)}, Test samples: {len(X_test)}\n"
                f"  Epochs: {epochs}\n"
                f"  Train accuracy: {train_accuracy*100:.2f}%\n"
                f"  Test accuracy (convergence): {test_accuracy*100:.2f}%\n"
                f"  True Positive: {tp}, False Positive: {fp}, True Negative: {tn}, False Negative: {fn}\n"
                f"  Precision: {precision:.3f}\n"
                f"  Recall: {recall:.3f}\n"
                f"  F1-Score: {f1:.3f}\n"
            )

            print(f"\n{result_msg}")

            # Store in corpus metadata
            if _corpus is not None:
                _corpus.metadata["lstm_predictions"] = result_msg

            if mcp:
                return result_msg

            return {
                "vocab_size": vocab_size,
                "train_samples": len(X_train),
                "test_samples": len(X_test),
                "epochs": epochs,
                "train_accuracy": train_accuracy,
                "test_accuracy": test_accuracy,
                "true_positive": tp,
                "false_positive": fp,
                "true_negative": tn,
                "false_negative": fn,
                "precision": precision,
                "recall": recall,
                "f1_score": f1,
            }

        except Exception as e:
            logger.error(f"Error in LSTM prediction: {e}")
            if mcp:
                return f"Error in LSTM prediction: {e}"
            return None
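
Taken together, the methods above form a simple triangulation workflow: prepare X and y from the CSV, then run one or more classifiers or searches against the same outcome. The snippet below is a minimal usage sketch only; `ml` is a hypothetical instance of the class documented on this page, assumed to already hold a Csv object whose corpus and DataFrame are loaded, and the column name is illustrative.

# Hypothetical usage sketch -- method names match this page, the setup is assumed.
outcome = "satisfaction"            # assumed binary column in the CSV

# Nearest-neighbour lookup around record 3
dist, ind = ml.knn_search(outcome, n=3, r=3)

# Permutation-importance ranking via a random forest
cm, importance = ml.get_decision_tree_classes(outcome, top_n=5)

# Frequent itemsets over one-hot encoded features
itemsets = ml.get_apriori(outcome, min_support=0.6)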

format_confusion_matrix_to_human_readable(confusion_matrix)

Format the confusion matrix to a human-readable string.

Parameters:

    confusion_matrix (ndarray): The confusion matrix to format. Required.

Returns:

    str: The formatted confusion matrix with true positive, false positive, true negative, and false negative counts.
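
As a quick illustration (a sketch, not part of the library's own output), a 2x2 matrix in scikit-learn's [[tn, fp], [fn, tp]] layout unpacks as shown below; `ml` is a hypothetical instance of the class documented on this page.

import numpy as np

cm = np.array([[5, 1],
               [2, 4]])               # rows: actual 0/1, columns: predicted 0/1
print(ml.format_confusion_matrix_to_human_readable(cm))
# True Positive: 4
# False Positive: 1
# True Negative: 5
# False Negative: 2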

Source code in src/crisp_t/ml.py
def format_confusion_matrix_to_human_readable(
    self, confusion_matrix: np.ndarray
) -> str:
    """Format the confusion matrix to a human-readable string.

    Args:
        confusion_matrix (np.ndarray): The confusion matrix to format.

    Returns:
        str: The formatted confusion matrix with true positive, false positive, true negative, and false negative counts.
    """
    tn, fp, fn, tp = confusion_matrix.ravel()
    return (
        f"True Positive: {tp}\n"
        f"False Positive: {fp}\n"
        f"True Negative: {tn}\n"
        f"False Negative: {fn}\n"
    )

get_lstm_predictions(y, mcp=False)

Train an LSTM model on text data to predict an outcome variable. This tests if the texts converge towards predicting the outcome.

Parameters:

    y (str): Name of the outcome variable in the DataFrame. Required.
    mcp (bool): If True, return a string format suitable for MCP. Default: False.

Returns:

    Evaluation metrics as a string (if mcp=True) or a dict.
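
A minimal usage sketch, assuming the corpus documents carry ids that match the DataFrame's `id` column and the outcome column is binary (`ml` is a hypothetical instance of the class documented here):

# Returns a dict of metrics, or a formatted string when mcp=True
metrics = ml.get_lstm_predictions("outcome", mcp=False)
if metrics is not None:
    print(f"Test accuracy: {metrics['test_accuracy']*100:.2f}%")
    print(f"F1-score: {metrics['f1_score']:.3f}")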

Source code in src/crisp_t/ml.py
def get_lstm_predictions(self, y: str, mcp=False):
    """
    Train an LSTM model on text data to predict an outcome variable.
    This tests if the texts converge towards predicting the outcome.

    Args:
        y (str): Name of the outcome variable in the DataFrame
        mcp (bool): If True, return a string format suitable for MCP

    Returns:
        Evaluation metrics as string (if mcp=True) or dict
    """
    if ML_INSTALLED is False:
        logger.error(
            "ML dependencies are not installed. Please install them by ```pip install crisp-t[ml] to use ML features."
        )
        if mcp:
            return "ML dependencies are not installed. Please install with: pip install crisp-t[ml]"
        return None

    if self._csv is None:
        logger.error("CSV data is not set.")
        if mcp:
            return "CSV data is not set. Cannot perform LSTM prediction."
        return None

    _corpus = self._csv.corpus
    if _corpus is None:
        logger.error("Corpus is not available.")
        if mcp:
            return "Corpus is not available. Cannot perform LSTM prediction."
        return None

    # Check if id_column exists
    id_column = "id"
    if not hasattr(self._csv, "df") or self._csv.df is None:
        logger.error("DataFrame is not available in CSV.")
        if mcp:
            return "This tool can be used only if texts and outcome variables align. DataFrame is missing."
        return None

    if id_column not in self._csv.df.columns:
        logger.error(
            f"The id_column '{id_column}' does not exist in the DataFrame."
        )
        if mcp:
            return f"This tool can be used only if texts and outcome variables align. The '{id_column}' column is missing from the DataFrame."
        return None

    # Check if outcome variable exists
    if y not in self._csv.df.columns:
        logger.error(f"The outcome variable '{y}' does not exist in the DataFrame.")
        if mcp:
            return f"The outcome variable '{y}' does not exist in the DataFrame."
        return None

    # Process documents and align with outcome variable
    try:
        # Build vocabulary from all documents
        from collections import Counter

        word_counts = Counter()
        tokenized_docs = []

        for doc in tqdm(_corpus.documents, desc="Tokenizing documents", disable=len(_corpus.documents) < 10):
            # Simple tokenization - split on whitespace and lowercase
            tokens = doc.text.lower().split()
            tokenized_docs.append(tokens)
            word_counts.update(tokens)

        # Create vocabulary with most common words (limit to 10000)
        vocab_size = min(10000, len(word_counts)) + 1  # +1 for padding
        most_common = word_counts.most_common(vocab_size - 1)
        word_to_idx = {
            word: idx + 1 for idx, (word, _) in enumerate(most_common)
        }  # 0 reserved for padding

        # Convert documents to sequences of indices
        max_length = 100  # Maximum sequence length
        sequences = []
        doc_ids = []

        for doc, tokens in tqdm(zip(_corpus.documents, tokenized_docs), total=len(_corpus.documents), desc="Converting to sequences", disable=len(_corpus.documents) < 10):
            # Convert tokens to indices
            seq = [word_to_idx.get(token, 0) for token in tokens]
            # Pad or truncate to max_length
            if len(seq) > max_length:
                seq = seq[:max_length]
            else:
                seq = seq + [0] * (max_length - len(seq))
            sequences.append(seq)
            doc_ids.append(doc.id)

        # Align with outcome variable using id column
        df = self._csv.df.set_index(id_column)

        aligned_sequences = []
        aligned_outcomes = []

        df_index_str = list(str(idx) for idx in df.index)
        for doc_id, seq in zip(doc_ids, sequences):
            if doc_id in df_index_str:
                aligned_sequences.append(seq)
                # Select y from df where id_column == doc_id, using string comparison
                matched_row = df.loc[
                    [idx for idx in df.index if str(idx) == str(doc_id)]
                ]
                if not matched_row.empty:
                    aligned_outcomes.append(matched_row.iloc[0][y])

        if len(aligned_sequences) == 0:
            logger.error("No documents could be aligned with the outcome variable.")
            if mcp:
                return "This tool can be used only if texts and outcome variables align. No matching IDs found."
            return None

        # Convert to tensors
        X_tensor = torch.LongTensor(aligned_sequences)  # type: ignore
        y_array = np.array(aligned_outcomes)

        # Handle binary classification
        unique_values = np.unique(y_array)
        num_classes = len(unique_values)

        if num_classes < 2:
            logger.error(
                f"Need at least 2 classes for classification, found {num_classes}"
            )
            if mcp:
                return f"Need at least 2 classes for classification, found {num_classes}"
            return None

        # Map to 0/1 for binary classification
        if num_classes == 2:
            class_mapping = {unique_values[0]: 0.0, unique_values[1]: 1.0}
            y_mapped = np.array(
                [class_mapping[val] for val in y_array], dtype=np.float32
            )
        else:
            # Multi-class not supported in this simple LSTM implementation
            logger.error(
                "Multi-class classification is not supported for LSTM. Please use binary outcome."
            )
            if mcp:
                return "Multi-class classification is not supported for LSTM. Please use binary outcome."
            return None

        y_tensor = torch.FloatTensor(y_mapped).view(-1, 1)  # type: ignore

        # Split into train/test
        from sklearn.model_selection import train_test_split

        indices = list(range(len(X_tensor)))
        train_idx, test_idx = train_test_split(
            indices, test_size=0.2, random_state=42
        )

        X_train = X_tensor[train_idx]
        y_train = y_tensor[train_idx]
        X_test = X_tensor[test_idx]
        y_test = y_tensor[test_idx]

        # Create model
        model = SimpleLSTM(vocab_size=vocab_size)  # type: ignore
        criterion = nn.BCELoss()  # type: ignore
        optimizer = optim.Adam(model.parameters(), lr=0.001)  # type: ignore

        # Create data loaders
        train_dataset = TensorDataset(X_train, y_train)  # type: ignore
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)  # type: ignore

        # Training
        epochs = max(self._epochs, 3)  # Use at least 3 epochs for LSTM
        model.train()
        for epoch in range(epochs):
            total_loss = 0
            for batch_x, batch_y in train_loader:
                optimizer.zero_grad()
                predictions = model(batch_x)
                loss = criterion(predictions, batch_y)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            avg_loss = total_loss / len(train_loader)
            logger.info(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

        # Evaluation
        model.eval()
        with torch.no_grad():  # type: ignore
            train_preds = model(X_train)
            test_preds = model(X_test)

            train_preds_binary = (train_preds >= 0.5).float()
            test_preds_binary = (test_preds >= 0.5).float()

            train_accuracy = (train_preds_binary == y_train).float().mean().item()
            test_accuracy = (test_preds_binary == y_test).float().mean().item()

        # Calculate additional metrics for test set
        y_test_np = y_test.cpu().numpy().flatten()
        test_preds_np = test_preds_binary.cpu().numpy().flatten()

        # Confusion matrix elements
        tp = ((test_preds_np == 1) & (y_test_np == 1)).sum()
        tn = ((test_preds_np == 0) & (y_test_np == 0)).sum()
        fp = ((test_preds_np == 1) & (y_test_np == 0)).sum()
        fn = ((test_preds_np == 0) & (y_test_np == 1)).sum()

        # Calculate precision, recall, F1
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = (
            2 * (precision * recall) / (precision + recall)
            if (precision + recall) > 0
            else 0
        )

        result_msg = (
            f"LSTM Model Evaluation for predicting '{y}':\n"
            f"  Vocabulary size: {vocab_size}\n"
            f"  Training samples: {len(X_train)}, Test samples: {len(X_test)}\n"
            f"  Epochs: {epochs}\n"
            f"  Train accuracy: {train_accuracy*100:.2f}%\n"
            f"  Test accuracy (convergence): {test_accuracy*100:.2f}%\n"
            f"  True Positive: {tp}, False Positive: {fp}, True Negative: {tn}, False Negative: {fn}\n"
            f"  Precision: {precision:.3f}\n"
            f"  Recall: {recall:.3f}\n"
            f"  F1-Score: {f1:.3f}\n"
        )

        print(f"\n{result_msg}")

        # Store in corpus metadata
        if _corpus is not None:
            _corpus.metadata["lstm_predictions"] = result_msg

        if mcp:
            return result_msg

        return {
            "vocab_size": vocab_size,
            "train_samples": len(X_train),
            "test_samples": len(X_test),
            "epochs": epochs,
            "train_accuracy": train_accuracy,
            "test_accuracy": test_accuracy,
            "true_positive": tp,
            "false_positive": fp,
            "true_negative": tn,
            "false_negative": fn,
            "precision": precision,
            "recall": recall,
            "f1_score": f1,
        }

    except Exception as e:
        logger.error(f"Error in LSTM prediction: {e}")
        if mcp:
            return f"Error in LSTM prediction: {e}"
        return None

get_nnet_predictions(y, mcp=False)

Extended: Handles binary (BCELoss) and multi-class (CrossEntropyLoss). Returns list of predicted original class labels.
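
A short usage sketch (hypothetical instance name `ml`; the outcome column is assumed to exist in the prepared data): binary targets are trained with BCELoss, three or more classes fall through to the CrossEntropyLoss path, and in both cases the return value is a list of labels in the original encoding.

preds = ml.get_nnet_predictions("outcome")        # list of original class labels
if preds is not None:
    from collections import Counter
    print(Counter(preds))                         # rough class balance of the predictions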

Source code in src/crisp_t/ml.py
def get_nnet_predictions(self, y: str, mcp=False):
    """
    Extended: Handles binary (BCELoss) and multi-class (CrossEntropyLoss).
    Returns list of predicted original class labels.
    """
    if ML_INSTALLED is False:
        logger.info(
            "ML dependencies are not installed. Please install them by ```pip install crisp-t[ml] to use ML features."
        )
        return None

    if self._csv is None:
        raise ValueError(
            "CSV data is not set. Please set self.csv before calling profile."
        )
    _corpus = self._csv.corpus

    X_np, Y_raw, X, Y = self._process_xy(y=y)

    unique_classes = np.unique(Y_raw)
    num_classes = unique_classes.size
    if num_classes < 2:
        raise ValueError(f"Need at least 2 classes; found {num_classes}.")

    vnum = X_np.shape[1]

    # Binary path
    if num_classes == 2:
        # Map to {0.0,1.0} for BCELoss if needed
        mapping_applied = False
        class_mapping = {}
        inverse_mapping = {}
        # Ensure deterministic order
        sorted_classes = sorted(unique_classes.tolist())
        if not (sorted_classes == [0, 1] or sorted_classes == [0.0, 1.0]):
            class_mapping = {sorted_classes[0]: 0.0, sorted_classes[1]: 1.0}
            inverse_mapping = {v: k for k, v in class_mapping.items()}
            Y_mapped = np.vectorize(class_mapping.get)(Y_raw).astype(np.float32)
            mapping_applied = True
        else:
            Y_mapped = Y_raw.astype(np.float32)

        model = NeuralNet(vnum)
        try:
            criterion = nn.BCELoss()  # type: ignore
            optimizer = optim.Adam(model.parameters(), lr=0.001)  # type: ignore

            X_tensor = torch.from_numpy(X_np)  # type: ignore
            y_tensor = torch.from_numpy(Y_mapped.astype(np.float32)).view(-1, 1)  # type: ignore

            dataset = TensorDataset(X_tensor, y_tensor)  # type: ignore
            dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  # type: ignore
        except Exception as e:
            logger.error(f"Error occurred while creating DataLoader: {e}")
            return None

        for _ in range(self._epochs):
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                if torch.isnan(loss):  # type: ignore
                    raise RuntimeError("NaN loss encountered.")
                loss.backward()
                optimizer.step()

        # Predictions
        bin_preds_internal = None
        if torch:
            with torch.no_grad():
                probs = model(torch.from_numpy(X_np)).view(-1).cpu().numpy()
            bin_preds_internal = (probs >= 0.5).astype(int)

        if mapping_applied:
            preds = [inverse_mapping[float(p)] for p in bin_preds_internal]  # type: ignore
            y_eval = np.vectorize(class_mapping.get)(Y_raw).astype(int)
            preds_eval = bin_preds_internal
        else:
            preds = bin_preds_internal.tolist()  # type: ignore
            y_eval = Y_mapped.astype(int)
            preds_eval = bin_preds_internal

        accuracy = (preds_eval == y_eval).sum() / len(y_eval)
        print(
            f"\nPredicting {y} with {X.shape[1]} features for {self._epochs} epochs gave an accuracy (convergence): {accuracy*100:.2f}%\n"
        )
        if _corpus is not None:
            _corpus.metadata["nnet_predictions"] = (
                f"Predicting {y} with {X.shape[1]} features for {self._epochs} epochs gave an accuracy (convergence): {accuracy*100:.2f}%"
            )
        if mcp:
            return f"Predicting {y} with {X.shape[1]} features for {self._epochs} epochs gave an accuracy (convergence): {accuracy*100:.2f}%"
        return preds

    # Multi-class path
    # Map original classes to indices
    sorted_classes = sorted(unique_classes.tolist())
    class_to_idx = {c: i for i, c in enumerate(sorted_classes)}
    idx_to_class = {i: c for c, i in class_to_idx.items()}
    Y_idx = np.vectorize(class_to_idx.get)(Y_raw).astype(np.int64)

    model = MultiClassNet(vnum, num_classes)
    criterion = nn.CrossEntropyLoss()  # type: ignore
    optimizer = optim.Adam(model.parameters(), lr=0.001)  # type: ignore

    X_tensor = torch.from_numpy(X_np)  # type: ignore
    y_tensor = torch.from_numpy(Y_idx)  # type: ignore

    dataset = TensorDataset(X_tensor, y_tensor)  # type: ignore
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)  # type: ignore

    for _ in range(self._epochs):
        for batch_X, batch_y in dataloader:
            optimizer.zero_grad()
            logits = model(batch_X)
            loss = criterion(logits, batch_y)
            if torch.isnan(loss):  # type: ignore
                raise RuntimeError("NaN loss encountered.")
            loss.backward()
            optimizer.step()

    with torch.no_grad():  # type: ignore
        logits_full = model(torch.from_numpy(X_np))  # type: ignore
        pred_indices = torch.argmax(logits_full, dim=1).cpu().numpy()  # type: ignore

    preds = [idx_to_class[i] for i in pred_indices]
    accuracy = (pred_indices == Y_idx).sum() / len(Y_idx)
    print(
        f"\nPredicting {y} with {X.shape[1]} features for {self._epochs} epochs gave an accuracy (convergence): {accuracy*100:.2f}%\n"
    )
    if _corpus is not None:
        _corpus.metadata["nnet_predictions"] = (
            f"Predicting {y} with {X.shape[1]} features for {self._epochs} epochs gave an accuracy (convergence): {accuracy*100:.2f}%"
        )
    if mcp:
        return f"Predicting {y} with {X.shape[1]} features for {self._epochs} epochs gave an accuracy (convergence): {accuracy*100:.2f}%"
    return preds

get_pca(y, n=3, mcp=False)

Perform a manual PCA (no sklearn PCA) on the feature matrix for target y.

Parameters:

    y (str): Target column name (used only for data preparation). Required.
    n (int): Number of principal components to keep. Default: 3.

Returns:

    dict: Keys 'covariance_matrix', 'eigenvalues', 'eigenvectors', 'explained_variance_ratio', 'cumulative_explained_variance_ratio', 'projection_matrix', and 'transformed'.
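
Because the transform is simply X_std @ W, with W the first n eigenvectors of the covariance matrix, the returned pieces can be checked against each other. A small sketch (hypothetical `ml` instance; the column name is illustrative):

import numpy as np

res = ml.get_pca("outcome", n=2)
W = res["projection_matrix"]                       # shape (n_features, 2)
X2 = res["transformed"]                            # shape (n_samples, 2)

# The leading eigenvalues of the covariance matrix should roughly match
# the variance of the projected components (np.cov uses ddof=1).
print(np.allclose(np.var(X2, axis=0, ddof=1), res["eigenvalues"][:2]))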

Source code in src/crisp_t/ml.py
def get_pca(self, y: str, n: int = 3, mcp=False):
    """
    Perform a manual PCA (no sklearn PCA) on the feature matrix for target y.

    Args:
        y (str): Target column name (used only for data preparation).
        n (int): Number of principal components to keep.

    Returns:
        dict: {
            'covariance_matrix': cov_mat,
            'eigenvalues': eig_vals_sorted,
            'eigenvectors': eig_vecs_sorted,
            'explained_variance_ratio': var_exp,
            'cumulative_explained_variance_ratio': cum_var_exp,
            'projection_matrix': matrix_w,
            'transformed': X_pca
        }
    """
    X_np, Y_raw, X, Y = self._process_xy(y=y)
    X_std = StandardScaler().fit_transform(X_np)

    cov_mat = np.cov(X_std.T)
    eig_vals, eig_vecs = np.linalg.eigh(cov_mat)  # symmetric matrix -> eigh

    # Sort eigenvalues (and vectors) descending
    idx = np.argsort(eig_vals)[::-1]
    eig_vals_sorted = eig_vals[idx]
    eig_vecs_sorted = eig_vecs[:, idx]

    factors = X_std.shape[1]
    n = max(1, min(n, factors))

    # Explained variance ratios
    tot = eig_vals_sorted.sum()
    var_exp = (eig_vals_sorted / tot) * 100.0
    cum_var_exp = np.cumsum(var_exp)

    # Projection matrix (first n eigenvectors)
    matrix_w = eig_vecs_sorted[:, :n]

    # Project data
    X_pca = X_std @ matrix_w

    # Optional prints (retain original behavior)
    print("Covariance matrix:\n", cov_mat)
    print("Eigenvalues (desc):\n", eig_vals_sorted)
    print("Explained variance (%):\n", var_exp[:n])
    print("Cumulative explained variance (%):\n", cum_var_exp[:n])
    print("Projection matrix (W):\n", matrix_w)
    print("Transformed (first 5 rows):\n", X_pca[:5])

    result = {
        "covariance_matrix": cov_mat,
        "eigenvalues": eig_vals_sorted,
        "eigenvectors": eig_vecs_sorted,
        "explained_variance_ratio": var_exp,
        "cumulative_explained_variance_ratio": cum_var_exp,
        "projection_matrix": matrix_w,
        "transformed": X_pca,
    }

    if self._csv.corpus is not None:
        self._csv.corpus.metadata["pca"] = (
            f"PCA kept {n} components explaining "
            f"{cum_var_exp[n-1]:.2f}% variance."
        )
    if mcp:
        return (
            f"PCA kept {n} components explaining {cum_var_exp[n-1]:.2f}% variance."
        )
    return result

get_regression(y, mcp=False)

Perform linear or logistic regression based on the outcome variable type.

If the outcome is binary, fit a logistic regression model. Otherwise, fit a linear regression model.

Parameters:

    y (str): Target column name for the regression. Required.

Returns:

    dict: Regression results including coefficients, intercept, and metrics.
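
A brief sketch of how the two branches differ in what they return (hypothetical `ml` instance; the column names are assumptions):

res = ml.get_regression("age")              # continuous target -> linear branch
if res is not None and res["model_type"] == "linear":
    print(res["mse"], res["r2"])

res = ml.get_regression("readmitted")       # binary target -> logistic branch
if res is not None and res["model_type"] == "logistic":
    print(res["accuracy"])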

Source code in src/crisp_t/ml.py
def get_regression(self, y: str, mcp=False):
    """
    Perform linear or logistic regression based on the outcome variable type.

    If the outcome is binary, fit a logistic regression model.
    Otherwise, fit a linear regression model.

    Args:
        y (str): Target column name for the regression.

    Returns:
        dict: Regression results including coefficients, intercept, and metrics.
    """
    if ML_INSTALLED is False:
        logger.info(
            "ML dependencies are not installed. Please install them by ```pip install crisp-t[ml] to use ML features."
        )
        return None

    if self._csv is None:
        raise ValueError(
            "CSV data is not set. Please set self.csv before calling get_regression."
        )

    X_np, Y_raw, X, Y = self._process_xy(y=y)

    # Check if outcome is binary (logistic) or continuous (linear)
    unique_values = np.unique(Y_raw)
    num_unique = len(unique_values)

    # Determine if binary classification or regression
    is_binary = num_unique == 2

    if is_binary:
        # Logistic Regression
        print(f"\n=== Logistic Regression for {y} ===")
        print(f"Binary outcome detected with values: {unique_values}")

        model = LogisticRegression(max_iter=1000, random_state=42)
        model.fit(X_np, Y_raw)

        # Predictions
        y_pred = model.predict(X_np)

        # Accuracy
        accuracy = accuracy_score(Y_raw, y_pred)
        print(f"\nAccuracy: {accuracy*100:.2f}%")

        # Coefficients and Intercept
        print(f"\nCoefficients:")
        for i, coef in enumerate(model.coef_[0]):
            feature_name = X.columns[i] if hasattr(X, "columns") else f"Feature_{i}"
            print(f"  {feature_name}: {coef:.5f}")

        print(f"\nIntercept: {model.intercept_[0]:.5f}")

        coef_str = "\n".join(
            [
                f"  {X.columns[i] if hasattr(X, 'columns') else f'Feature_{i}'}: {coef:.5f}"
                for i, coef in enumerate(model.coef_[0])
            ]
        )

        # Store in metadata
        if self._csv.corpus is not None:
            self._csv.corpus.metadata["logistic_regression_accuracy"] = (
                f"Logistic Regression accuracy for predicting {y}: {accuracy*100:.2f}%"
            )
            self._csv.corpus.metadata["logistic_regression_coefficients"] = (
                f"Coefficients:\n{coef_str}"
            )
            self._csv.corpus.metadata["logistic_regression_intercept"] = (
                f"Intercept: {model.intercept_[0]:.5f}"
            )

        if mcp:
            return f"""
            Logistic Regression accuracy for predicting {y}: {accuracy*100:.2f}%
            Coefficients:
            {coef_str}
            Intercept: {model.intercept_[0]:.5f}
            """
        return {
            "model_type": "logistic",
            "accuracy": accuracy,
            "coefficients": model.coef_[0],
            "intercept": model.intercept_[0],
            "feature_names": X.columns.tolist() if hasattr(X, "columns") else None,
        }
    else:
        # Linear Regression
        print(f"\n=== Linear Regression for {y} ===")
        print(f"Continuous outcome detected with {num_unique} unique values")

        model = LinearRegression()
        model.fit(X_np, Y_raw)

        # Predictions
        y_pred = model.predict(X_np)

        # Metrics
        mse = mean_squared_error(Y_raw, y_pred)
        r2 = r2_score(Y_raw, y_pred)
        print(f"\nMean Squared Error (MSE): {mse:.5f}")
        print(f"R² Score: {r2:.5f}")

        # Coefficients and Intercept
        print(f"\nCoefficients:")
        for i, coef in enumerate(model.coef_):
            feature_name = X.columns[i] if hasattr(X, "columns") else f"Feature_{i}"
            print(f"  {feature_name}: {coef:.5f}")

        print(f"\nIntercept: {model.intercept_:.5f}")

        coef_str = "\n".join(
            [
                f"  {X.columns[i] if hasattr(X, 'columns') else f'Feature_{i}'}: {coef:.5f}"
                for i, coef in enumerate(model.coef_)
            ]
        )

        # Store in metadata
        if self._csv.corpus is not None:
            self._csv.corpus.metadata["linear_regression_mse"] = (
                f"Linear Regression MSE for predicting {y}: {mse:.5f}"
            )
            self._csv.corpus.metadata["linear_regression_r2"] = (
                f"Linear Regression R² for predicting {y}: {r2:.5f}"
            )
            self._csv.corpus.metadata["linear_regression_coefficients"] = (
                f"Coefficients:\n{coef_str}"
            )
            self._csv.corpus.metadata["linear_regression_intercept"] = (
                f"Intercept: {model.intercept_:.5f}"
            )

        if mcp:
            return f"""
            Linear Regression MSE for predicting {y}: {mse:.5f}
            R²: {r2:.5f}
            Feature Names and Coefficients:
            {coef_str}
            Intercept: {model.intercept_:.5f}
            """
        return {
            "model_type": "linear",
            "mse": mse,
            "r2": r2,
            "coefficients": model.coef_,
            "intercept": model.intercept_,
            "feature_names": X.columns.tolist() if hasattr(X, "columns") else None,
        }

svm_confusion_matrix(y, test_size=0.25, random_state=0, mcp=False)

Generate a confusion matrix for a linear SVM classifier.

Returns:

    ndarray: Confusion matrix for the SVM classifier on the held-out test split.
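
Usage sketch (hypothetical `ml` instance; column name is illustrative): the target is first collapsed to 0/1 by _convert_to_binary, so a two-level or multi-level column can be passed.

cm = ml.svm_confusion_matrix("outcome", test_size=0.25, random_state=0)
tn, fp, fn, tp = cm.ravel()
if (tp + fn) > 0:
    print(f"Sensitivity: {tp / (tp + fn):.2f}")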

Source code in src/crisp_t/ml.py
def svm_confusion_matrix(self, y: str, test_size=0.25, random_state=0, mcp=False):
    """Generate confusion matrix for SVM

    Returns:
        [list] -- [description]
    """
    X_np, Y_raw, X, Y = self._process_xy(y=y)
    Y = self._convert_to_binary(Y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    sc = StandardScaler()
    # Issue #22
    y_test = y_test.astype("int")
    y_train = y_train.astype("int")
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    classifier = SVC(kernel="linear", random_state=0)
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    # Issue #22
    y_pred = y_pred.astype("int")
    _confusion_matrix = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for SVM predicting {y}:\n{_confusion_matrix}")
    # Output
    # [[2 0]
    #  [2 0]]
    if self._csv.corpus is not None:
        self._csv.corpus.metadata["svm_confusion_matrix"] = (
            f"Confusion Matrix for SVM predicting {y}:\n{self.format_confusion_matrix_to_human_readable(_confusion_matrix)}"
        )

    if mcp:
        return f"Confusion Matrix for SVM predicting {y}:\n{self.format_confusion_matrix_to_human_readable(_confusion_matrix)}"

    return _confusion_matrix
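
The same pattern (linear SVC on scaled features with a binary outcome) as a standalone scikit-learn sketch on toy data, independent of crisp-t:

import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = (X[:, 0] + X[:, 1] > 0).astype(int)  # binary outcome

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

clf = SVC(kernel="linear", random_state=0).fit(X_train, y_train)
cm = confusion_matrix(y_test, clf.predict(X_test))
# Rows are true labels, columns are predicted labels:
# [[true negatives, false positives],
#  [false negatives, true positives]]
print(cm)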

Copyright (C) 2025 Bell Eapen

This file is part of crisp-t.

crisp-t is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version.

crisp-t is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with crisp-t. If not, see https://www.gnu.org/licenses/.

Network

A class to represent a network of documents and their relationships.

Source code in src/crisp_t/network.py
class Network:
    """
    A class to represent a network of documents and their relationships.
    """

    def __init__(self, corpus: Corpus):
        """
        Initialize the Network with a corpus.

        :param corpus: Corpus object containing documents to be included in the network.
        """
        self._corpus = corpus
        self._cluster = Cluster(corpus)
        self._processed_docs = self._cluster.processed_docs
        self._graph = None

    def cooccurence_network(self, window_size=2):
        """Build a word co-occurrence network from the processed documents."""
        self._graph = network.build_cooccurrence_network(
            self._processed_docs, window_size=window_size
        )
        return self._graph

    def similarity_network(self, method="levenshtein"):
        """Build a sentence similarity network using the given string-similarity method."""
        text = Text(self._corpus)
        docs = text.make_spacy_doc()
        data = [sent.text.lower() for sent in docs.sents]
        self._graph = network.build_similarity_network(data, method)
        return self._graph

    def graph_as_dict(self):
        """
        Convert the graph to a dictionary representation.

        :return: Dictionary representation of the graph.
        """
        if self._graph is None:
            raise ValueError(
                "Graph has not been created yet. Call cooccurence_network() or similarity_network() first."
            )
        # Returns the (node, neighbours) adjacency pair of the first node in sorted order
        return sorted(self._graph.adjacency())[0]

__init__(corpus)

Initialize the Network with a corpus.

:param corpus: Corpus object containing documents to be included in the network.

Source code in src/crisp_t/network.py
def __init__(self, corpus: Corpus):
    """
    Initialize the Network with a corpus.

    :param corpus: Corpus object containing documents to be included in the network.
    """
    self._corpus = corpus
    self._cluster = Cluster(corpus)
    self._processed_docs = self._cluster.processed_docs
    self._graph = None

graph_as_dict()

Convert the graph to a dictionary representation.

:return: Dictionary representation of the graph.

Source code in src/crisp_t/network.py
def graph_as_dict(self):
    """
    Convert the graph to a dictionary representation.

    :return: Dictionary representation of the graph.
    """
    if self._graph is None:
        raise ValueError(
            "Graph has not been created yet. Call cooccurence_network() or similarity_network() first."
        )
    # Returns the (node, neighbours) adjacency pair of the first node in sorted order
    return sorted(self._graph.adjacency())[0]
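
A minimal usage sketch of the class above. The `corpus` variable stands for an existing crisp-t Corpus, and the import path is assumed from the file layout (src/crisp_t/network.py):

from crisp_t.network import Network  # import path assumed from the file layout

# `corpus` is assumed to be an existing crisp_t Corpus instance
net = Network(corpus)
graph = net.cooccurence_network(window_size=2)      # word co-occurrence graph
# or: graph = net.similarity_network(method="levenshtein")
print(net.graph_as_dict())                          # adjacency entry of the first node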

QRVisualize

Source code in src/crisp_t/visualize.py
class QRVisualize:
    """Visualization helpers for topic, network, and corpus-level analysis results."""

    def __init__(
        self, corpus: Corpus | None = None, folder_path: str | None = None
    ) -> None:
        # Matplotlib figure components assigned lazily by plotting methods
        self.corpus = corpus
        self.folder_path = folder_path
        self.fig: Figure | None = None
        self.ax: Axes | None = None
        self.sc: PathCollection | None = None
        self.annot: Annotation | None = None
        self.names: list[str] = []
        self.c: np.ndarray | None = None

    def _ensure_columns(
        self, df: pd.DataFrame, required: Iterable[str]
    ) -> pd.DataFrame:
        """Ensure that the DataFrame has the required columns.

        Behavior:
        - If all required columns already exist, return df unchanged.
        - If the DataFrame has exactly the same number of columns as required,
          rename columns positionally to match the required names.
        - Otherwise, raise a ValueError listing the missing columns.
        """
        required = list(required)
        # Fast path: all required columns present
        missing = [col for col in required if col not in df.columns]
        if not missing:
            return df

        # If shape matches, attempt a positional rename
        if len(df.columns) == len(required):
            df = df.copy()
            df.columns = required
            return df

        # Otherwise, cannot satisfy required columns
        raise ValueError(f"Missing required columns: {missing}")

    def _finalize_plot(
        self,
        fig: Figure,
        folder_path: str | None,
        show: bool,
    ) -> Figure:
        if not folder_path:
            folder_path = self.folder_path
        if folder_path:
            output_path = Path(folder_path)
            if output_path.parent:
                output_path.parent.mkdir(parents=True, exist_ok=True)
            fig.savefig(folder_path)
        if show:
            plt.show(block=False)
        else:
            plt.close(fig)
        return fig

    def plot_frequency_distribution_of_words(
        self,
        df: pd.DataFrame | None = None,
        folder_path: str | None = None,
        text_column: str = "Text",
        bins: int = 100,
        show: bool = True,
    ) -> Tuple[Figure, Axes]:
        if df is None:
            try:
                df = pd.DataFrame(self.corpus.visualization["assign_topics"])
            except Exception as e:
                raise ValueError(f"Failed to create DataFrame from corpus: {e}")
        df = self._ensure_columns(df, [text_column])
        doc_lens = df[text_column].dropna().map(len).tolist()
        if not doc_lens:
            raise ValueError("No documents available to plot frequency distribution.")

        fig, ax = plt.subplots(figsize=(16, 7), dpi=160)
        counts, _, _ = ax.hist(doc_lens, bins=bins, color="navy")
        counts = np.asarray(counts)
        if counts.size:
            ax.set_ylim(top=float(counts.max()) * 1.1)

        stats = {
            "Mean": round(np.mean(doc_lens), 2),
            "Median": round(np.median(doc_lens), 2),
            "Stdev": round(np.std(doc_lens), 2),
            "1%ile": round(np.quantile(doc_lens, q=0.01), 2),
            "99%ile": round(np.quantile(doc_lens, q=0.99), 2),
        }
        for idx, (label, value) in enumerate(stats.items()):
            ax.text(
                0.98,
                0.98 - idx * 0.05,
                f"{label}: {value}",
                transform=ax.transAxes,
                ha="right",
                va="top",
                fontsize=11,
            )

        ax.set(
            ylabel="Number of Documents",
            xlabel="Document Word Count",
            title="Distribution of Document Word Counts",
        )
        ax.tick_params(axis="both", labelsize=12)
        if doc_lens:
            ax.set_xlim(left=0, right=max(doc_lens) * 1.05)

        fig = self._finalize_plot(fig, folder_path, show)
        return fig, ax

    def plot_distribution_by_topic(
        self,
        df: pd.DataFrame | None = None,
        folder_path: str | None = None,
        topic_column: str = "Dominant_Topic",
        text_column: str = "Text",
        bins: int = 100,
        show: bool = True,
    ) -> Tuple[Figure, np.ndarray]:
        if df is None:
            try:
                df = pd.DataFrame(self.corpus.visualization["assign_topics"])
            except Exception as e:
                raise ValueError(f"Failed to create DataFrame from corpus: {e}")
        df = self._ensure_columns(df, [topic_column, text_column])
        unique_topics = sorted(df[topic_column].dropna().unique())
        if not unique_topics:
            raise ValueError("No topics found to plot distribution.")

        n_topics = len(unique_topics)
        n_cols = min(3, n_topics)
        n_rows = math.ceil(n_topics / n_cols)
        cols = list(mcolors.TABLEAU_COLORS.values())

        fig, axes = plt.subplots(
            n_rows,
            n_cols,
            figsize=(6 * n_cols, 5 * n_rows),
            dpi=160,
            sharex=True,
            sharey=True,
        )
        if isinstance(axes, np.ndarray):
            axes_flat = axes.flatten().tolist()
        else:
            axes_flat = [axes]

        for idx, topic in enumerate(unique_topics):
            ax = axes_flat[idx]
            topic_series = cast(
                pd.Series,
                df.loc[df[topic_column] == topic, text_column],
            )
            topic_docs = topic_series.dropna()
            doc_lens = topic_docs.map(len).tolist()
            color = cols[idx % len(cols)]
            if doc_lens:
                ax.hist(doc_lens, bins=bins, color=color, alpha=0.7)
                sns.kdeplot(
                    doc_lens,
                    color="black",
                    fill=False,
                    ax=ax.twinx(),
                    warn_singular=False,
                )
            ax.set(xlabel="Document Word Count")
            ax.set_ylabel("Number of Documents", color=color)
            ax.set_title(f"Topic: {topic}", fontdict=dict(size=14, color=color))
            ax.tick_params(axis="y", labelcolor=color, color=color)

        for extra_ax in axes_flat[len(unique_topics) :]:
            extra_ax.set_visible(False)

        fig.tight_layout()
        fig.suptitle(
            "Distribution of Document Word Counts by Dominant Topic",
            fontsize=20,
            y=1.02,
        )

        fig = self._finalize_plot(fig, folder_path, show)
        axes_array = np.array(axes_flat, dtype=object).reshape(n_rows, n_cols)
        return fig, axes_array

    def plot_wordcloud(
        self,
        topics=None,
        folder_path: str | None = None,
        max_words: int = 50,
        show: bool = True,
    ) -> Tuple[Figure, np.ndarray]:
        if not topics:
            try:
                topics = self.corpus.visualization["word_cloud"]
            except Exception as e:
                raise ValueError(f"Failed to retrieve topics from corpus: {e}")
        n_topics = len(topics)
        n_cols = min(3, n_topics)
        n_rows = math.ceil(n_topics / n_cols)
        cols = list(mcolors.TABLEAU_COLORS.values())

        fig, axes = plt.subplots(
            n_rows,
            n_cols,
            figsize=(6 * n_cols, 4 * n_rows),
            sharex=True,
            sharey=True,
        )
        axes_flat = axes.flatten().tolist() if isinstance(axes, np.ndarray) else [axes]

        for idx, (topic_id, words) in enumerate(topics):
            ax = axes_flat[idx]
            topic_words = dict(words)
            color = cols[idx % len(cols)]
            cloud = WordCloud(
                stopwords=STOPWORDS,
                background_color="white",
                width=800,
                height=400,
                max_words=max_words,
                colormap="tab10",
                color_func=lambda *args, color=color, **kwargs: color,
                prefer_horizontal=0.9,
            )
            cloud.generate_from_frequencies(topic_words)
            ax.imshow(cloud)
            ax.set_title(f"Topic {topic_id}", fontdict=dict(size=14))
            ax.axis("off")

        for extra_ax in axes_flat[len(topics) :]:
            extra_ax.set_visible(False)

        fig.tight_layout()

        fig = self._finalize_plot(fig, folder_path, show)
        return fig, np.array(axes_flat).reshape(n_rows, n_cols)

    def plot_top_terms(
        self,
        df: pd.DataFrame | None = None,
        term_column: str = "term",
        frequency_column: str = "frequency",
        top_n: int = 20,
        folder_path: str | None = None,
        ascending: bool = False,
        show: bool = True,
    ) -> Tuple[Figure, Axes]:
        if df is None:
            try:
                df = pd.DataFrame(self.corpus.visualization["assign_topics"])
            except Exception as e:
                raise ValueError(f"Failed to create DataFrame from corpus: {e}")
        if top_n <= 0:
            raise ValueError("top_n must be greater than zero.")

        df = self._ensure_columns(df, [term_column, frequency_column])
        subset = df[[term_column, frequency_column]].dropna()
        if subset.empty:
            raise ValueError("No data available to plot top terms.")

        subset = subset.sort_values(frequency_column, ascending=ascending).head(top_n)
        subset = subset.iloc[::-1]

        fig, ax = plt.subplots(figsize=(10, max(4, top_n * 0.4)))
        ax.barh(subset[term_column], subset[frequency_column], color="steelblue")
        ax.set_xlabel("Frequency")
        ax.set_ylabel("Term")
        ax.set_title("Top Terms by Frequency")
        for idx, value in enumerate(subset[frequency_column]):
            ax.text(value, idx, f" {value}", va="center")
        fig.tight_layout()

        fig = self._finalize_plot(fig, folder_path, show)
        return fig, ax

    def plot_correlation_heatmap(
        self,
        df: pd.DataFrame | None = None,
        columns: Sequence[str] | None = None,
        folder_path: str | None = None,
        cmap: str = "coolwarm",
        show: bool = True,
    ) -> Tuple[Figure, Axes]:
        if df is None:
            try:
                df = pd.DataFrame(self.corpus.visualization["assign_topics"])
            except Exception as e:
                raise ValueError(f"Failed to create DataFrame from corpus: {e}")
        if columns:
            df = self._ensure_columns(df, columns)
            data = df[list(columns)]
        else:
            data = df
        if data.empty:
            raise ValueError("No data available to compute correlation heatmap.")

        numeric_data = data.select_dtypes(include=[np.number])
        if numeric_data.shape[1] < 2:
            raise ValueError(
                "At least two numeric columns are required for correlation heatmap."
            )

        corr = numeric_data.corr()
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.heatmap(corr, ax=ax, cmap=cmap, annot=True, fmt=".2f", square=True)
        ax.set_title("Correlation Heatmap")
        fig.tight_layout()

        fig = self._finalize_plot(fig, folder_path, show)
        return fig, ax

    def plot_importance(
        self,
        topics: Sequence[Tuple[int, Sequence[Tuple[str, float]]]],
        processed_docs: Sequence[Sequence[str]],
        folder_path: str | None = None,
        show: bool = True,
    ) -> Tuple[Figure, np.ndarray]:
        if not topics:
            raise ValueError("No topics provided to plot importance.")
        if not processed_docs:
            raise ValueError("No processed documents provided to plot importance.")

        counter = Counter(word for doc in processed_docs for word in doc)
        rows = []
        for topic_id, words in topics:
            for word, weight in words:
                rows.append(
                    {
                        "word": word,
                        "topic_id": topic_id,
                        "importance": weight,
                        "word_count": counter.get(word, 0),
                    }
                )

        df = pd.DataFrame(rows)
        if df.empty:
            raise ValueError("Unable to build importance DataFrame from inputs.")

        topic_ids = sorted(df["topic_id"].unique())
        n_topics = len(topic_ids)
        n_cols = min(3, n_topics)
        n_rows = math.ceil(n_topics / n_cols)
        cols = list(mcolors.TABLEAU_COLORS.values())

        fig, axes = plt.subplots(
            n_rows,
            n_cols,
            figsize=(7 * n_cols, 5 * n_rows),
            sharey=False,
            dpi=160,
        )
        axes_flat = axes.flatten().tolist() if isinstance(axes, np.ndarray) else [axes]

        for idx, topic_id in enumerate(topic_ids):
            ax = axes_flat[idx]
            subset = df[df["topic_id"] == topic_id]
            color = cols[idx % len(cols)]
            ax.bar(
                subset["word"],
                subset["word_count"],
                color=color,
                width=0.5,
                alpha=0.4,
                label="Word Count",
            )
            ax_twin = ax.twinx()
            ax_twin.plot(
                subset["word"],
                subset["importance"],
                color=color,
                marker="o",
                label="Importance",
            )
            ax.set_title(f"Topic {topic_id}", color=color, fontsize=14)
            ax.set_xlabel("Word")
            ax.set_ylabel("Word Count", color=color)
            ax.tick_params(axis="y", labelcolor=color)
            ax_twin.set_ylabel("Importance", color=color)
            ax_twin.tick_params(axis="y", labelcolor=color)
            ax.set_xticklabels(subset["word"], rotation=30, ha="right")
            ax.legend(loc="upper left")
            ax_twin.legend(loc="upper right")

        for extra_ax in axes_flat[len(topic_ids) :]:
            extra_ax.set_visible(False)

        fig.tight_layout()
        fig.suptitle(
            "Word Count and Importance of Topic Keywords",
            fontsize=20,
            y=1.02,
        )

        fig = self._finalize_plot(fig, folder_path, show)
        return fig, np.array(axes_flat).reshape(n_rows, n_cols)

    def sentence_chart(self, lda_model, text, start=0, end=13, folder_path=None):
        """Color each document's words by their per-word dominant topic, boxed in the document's dominant-topic color."""
        if lda_model is None:
            raise ValueError("LDA model is not provided.")
        corp = text[start:end]
        mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()]

        fig, axes = plt.subplots(
            end - start, 1, figsize=(20, (end - start) * 0.95), dpi=160
        )
        axes[0].axis("off")
        for i, ax in enumerate(axes):
            try:
                if i > 0:
                    corp_cur = corp[i - 1]
                    topic_percs, wordid_topics, _ = lda_model[corp_cur]
                    word_dominanttopic = [
                        (lda_model.id2word[wd], topic[0]) for wd, topic in wordid_topics
                    ]
                    ax.text(
                        0.01,
                        0.5,
                        "Doc " + str(i - 1) + ": ",
                        verticalalignment="center",
                        fontsize=16,
                        color="black",
                        transform=ax.transAxes,
                        fontweight=700,
                    )

                    # Draw Rectangle
                    topic_percs_sorted = sorted(
                        topic_percs, key=lambda x: (x[1]), reverse=True
                    )
                    ax.add_patch(
                        Rectangle(
                            (0.0, 0.05),
                            0.99,
                            0.90,
                            fill=None,
                            alpha=1,
                            color=mycolors[topic_percs_sorted[0][0]],
                            linewidth=2,
                        )
                    )

                    word_pos = 0.06
                    for j, (word, topics) in enumerate(word_dominanttopic):
                        if j < 14:
                            ax.text(
                                word_pos,
                                0.5,
                                word,
                                horizontalalignment="left",
                                verticalalignment="center",
                                fontsize=16,
                                color=mycolors[topics],
                                transform=ax.transAxes,
                                fontweight=700,
                            )
                            word_pos += 0.009 * len(
                                word
                            )  # to move the word for the next iter
                            ax.axis("off")
                    ax.text(
                        word_pos,
                        0.5,
                        ". . .",
                        horizontalalignment="left",
                        verticalalignment="center",
                        fontsize=16,
                        color="black",
                        transform=ax.transAxes,
                    )
            except Exception as e:
                logger.error(f"Error occurred while processing document {i - 1}: {e}")
                continue

        plt.subplots_adjust(wspace=0, hspace=0)
        plt.suptitle(
            "Sentence Topic Coloring for Documents: "
            + str(start)
            + " to "
            + str(end - 2),
            fontsize=22,
            y=0.95,
            fontweight=700,
        )
        plt.tight_layout()
        plt.show(block=False)
        # save
        if folder_path:
            plt.savefig(folder_path)
            plt.close()

    def _cluster_chart(self, lda_model, text, n_topics=3, folder_path=None):
        """Reduce per-document topic weights to 2-D with t-SNE and scatter them by dominant topic."""
        # Get topic weights
        topic_weights = []
        for i, row_list in enumerate(lda_model[text]):
            topic_weights.append([w for i, w in row_list[0]])

        # Array of topic weights
        arr = pd.DataFrame(topic_weights).fillna(0).values

        # Keep the well separated points (optional)
        arr = arr[np.amax(arr, axis=1) > 0.35]

        # Dominant topic number in each doc
        topic_num = np.argmax(arr, axis=1)

        # tSNE Dimension Reduction
        tsne_model = TSNE(
            n_components=2, verbose=1, random_state=0, angle=0.99, init="pca"
        )
        tsne_lda = tsne_model.fit_transform(arr)

        # Plot
        plt.figure(figsize=(16, 10), dpi=160)
        for i in range(n_topics):
            plt.scatter(
                tsne_lda[topic_num == i, 0],
                tsne_lda[topic_num == i, 1],
                label=str(i),
                alpha=0.5,
            )
        plt.title("t-SNE Clustering of Topics", fontsize=22)
        plt.xlabel("t-SNE Dimension 1", fontsize=16)
        plt.ylabel("t-SNE Dimension 2", fontsize=16)
        plt.legend(title="Topic Number", loc="upper right")
        plt.show(block=False)
        # save
        if folder_path:
            plt.savefig(folder_path)
            plt.close()

    def most_discussed_topics(
        self, lda_model, dominant_topics, topic_percentages, folder_path=None
    ):
        """Plot document counts per dominant topic and total topic weightage, labelled with each topic's top-3 keywords."""
        # Distribution of Dominant Topics in Each Document
        df = pd.DataFrame(dominant_topics, columns=["Document_Id", "Dominant_Topic"])
        dominant_topic_in_each_doc = df.groupby("Dominant_Topic").size()
        df_dominant_topic_in_each_doc = dominant_topic_in_each_doc.to_frame(
            name="count"
        ).reset_index()

        # Total Topic Distribution by actual weight
        topic_weightage_by_doc = pd.DataFrame([dict(t) for t in topic_percentages])
        df_topic_weightage_by_doc = (
            topic_weightage_by_doc.sum().to_frame(name="count").reset_index()
        )

        # Top 3 Keywords for each Topic
        topic_top3words = [
            (i, topic)
            for i, topics in lda_model.show_topics(formatted=False)
            for j, (topic, wt) in enumerate(topics)
            if j < 3
        ]

        df_top3words_stacked = pd.DataFrame(
            topic_top3words, columns=["topic_id", "words"]
        )
        df_top3words = df_top3words_stacked.groupby("topic_id").agg(", \n".join)
        df_top3words.reset_index(level=0, inplace=True)

        # Plot
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4), dpi=120, sharey=True)

        # Topic Distribution by Dominant Topics
        ax1.bar(
            x="Dominant_Topic",
            height="count",
            data=df_dominant_topic_in_each_doc,
            width=0.5,
            color="firebrick",
        )
        ax1.set_xticks(
            range(df_dominant_topic_in_each_doc.Dominant_Topic.unique().__len__())
        )
        tick_formatter = FuncFormatter(
            lambda x, pos: "Topic "
            + str(x)
            + "\n"
            + df_top3words.loc[df_top3words.topic_id == x, "words"].values[0]  # type: ignore
        )
        ax1.xaxis.set_major_formatter(tick_formatter)
        ax1.set_title("Number of Documents by Dominant Topic", fontdict=dict(size=10))
        ax1.set_ylabel("Number of Documents")
        ax1.set_ylim(0, 1000)

        # Topic Distribution by Topic Weights
        ax2.bar(
            x="index",
            height="count",
            data=df_topic_weightage_by_doc,
            width=0.5,
            color="steelblue",
        )
        ax2.set_xticks(range(df_topic_weightage_by_doc.index.unique().__len__()))
        ax2.xaxis.set_major_formatter(tick_formatter)
        ax2.set_title("Number of Documents by Topic Weightage", fontdict=dict(size=10))

        plt.show(block=False)

        # save
        if folder_path:
            plt.savefig(folder_path)
            plt.close()

    def update_annot(self, ind):
        if self.annot is None or self.sc is None or self.c is None:
            raise RuntimeError("cluster_chart must be called before update_annot.")
        indices_array = np.atleast_1d(ind.get("ind", []))
        if indices_array.size == 0:
            return
        indices = indices_array.astype(int)
        idx = int(indices[0])
        offsets = np.asarray(self.sc.get_offsets())
        pos = offsets[idx]
        annot = self.annot
        annot.xy = (float(pos[0]), float(pos[1]))
        text = "{}, {}".format(
            " ".join(list(map(str, indices))),
            " ".join([self.names[n] for n in indices]),
        )
        annot.set_text(text)
        cmap = plt.get_cmap("RdYlGn")
        norm = mcolors.Normalize(1, 4)
        bbox = annot.get_bbox_patch()
        if bbox is not None:
            try:
                color_value = float(self.c[idx])
            except (TypeError, ValueError):
                color_value = 1.0
            bbox.set_facecolor(cmap(norm(color_value)))
            bbox.set_alpha(0.4)

    def hover(self, event):
        if self.annot is None or self.sc is None or self.fig is None or self.ax is None:
            return
        vis = self.annot.get_visible()
        if event.inaxes == self.ax:
            cont, ind = self.sc.contains(event)
            if cont:
                self.update_annot(ind)
                self.annot.set_visible(True)
                self.fig.canvas.draw_idle()
            elif vis:
                self.annot.set_visible(False)
                self.fig.canvas.draw_idle()

    # https://stackoverflow.com/questions/7908636/how-to-add-hovering-annotations-to-a-plot
    def cluster_chart(self, data, folder_path=None):
        """Interactive scatter plot of text-cluster predictions; hovering reveals document titles."""
        # Scatter plot for Text Cluster Prediction
        self.fig, self.ax = plt.subplots(figsize=(6, 6))
        self.names = list(map(str, data["title"]))
        self.sc = plt.scatter(
            data["x"],
            data["y"],
            c=data["colour"],
            s=36,
            edgecolors="black",
            linewidths=0.75,
        )
        self.c = np.asarray(data["colour"])
        self.annot = self.ax.annotate(
            "",
            xy=(0, 0),
            xytext=(20, 20),
            textcoords="offset points",
            bbox=dict(boxstyle="round", fc="w"),
            arrowprops=dict(arrowstyle="->"),
        )
        self.annot.set_visible(False)
        plt.title("Text Cluster Prediction")
        plt.axis("off")  # Optional: Remove axes for a cleaner look
        plt.colorbar(self.sc, label="Colour")  # Add colorbar if needed
        self.fig.canvas.mpl_connect("motion_notify_event", self.hover)
        plt.show(block=False)
        # save
        if folder_path:
            # annotate with data['title']
            for i, txt in enumerate(data["title"]):
                plt.annotate(
                    txt,
                    (data["x"][i], data["y"][i]),
                    fontsize=8,
                    ha="right",
                    va="bottom",
                )
            plt.savefig(folder_path)
            plt.close()

    def get_lda_viz(
        self,
        lda_model,
        corpus_bow,
        dictionary,
        folder_path: str | None = None,
        mds: str = "tsne",
        lambda_val: float = 0.6,
        show: bool = True,
    ) -> str | None:
        """
        Generate an interactive LDA visualization using pyLDAvis.

        Args:
            lda_model: The trained LDA model
            corpus_bow: Bag of words corpus
            dictionary: Gensim dictionary
            folder_path: Path to save the HTML visualization
            mds: Dimension reduction method ('tsne', 'mmds', or 'pcoa')
            lambda_val: Lambda parameter for relevance metric (default: 0.6).
                       Mettler et al. (2025) performed several experiments to identify
                       the optimal value of λ, which turned out to be 0.6.
            show: Whether to display the visualization

        Returns:
            HTML string of the visualization if successful, None otherwise

        Raises:
            ImportError: If pyLDAvis is not installed
            ValueError: If required inputs are missing
        """
        if not PYLDAVIS_AVAILABLE:
            raise ImportError(
                "pyLDAvis is not installed. Install it with: pip install pyLDAvis"
            )

        if lda_model is None:
            raise ValueError("LDA model is required")
        if corpus_bow is None:
            raise ValueError("Corpus bag of words is required")
        if dictionary is None:
            raise ValueError("Dictionary is required")

        try:
            # Prepare the visualization data
            vis_data = gensimvis.prepare(
                lda_model,
                corpus_bow,
                dictionary,
                mds=mds,
                R=30,
                lambda_step=0.01,
                plot_opts={"xlab": "PC1", "ylab": "PC2"},
            )

            # Save to HTML file if path provided
            if folder_path:
                output_path = Path(folder_path)
                if output_path.parent:
                    output_path.parent.mkdir(parents=True, exist_ok=True)
                pyLDAvis.save_html(vis_data, str(output_path))
                logger.info(f"LDA visualization saved to {output_path}")

            # Return HTML string for embedding or further use
            html_string = pyLDAvis.prepared_data_to_html(vis_data)
            return html_string

        except Exception as e:
            logger.error(f"Error generating LDA visualization: {e}")
            raise

    def draw_tdabm(
        self,
        corpus: Corpus | None = None,
        folder_path: str | None = None,
        show: bool = True,
    ) -> Figure:
        """
        Draw TDABM (Topological Data Analysis Ball Mapper) visualization.

        Creates a 2D graph showing landmark points as circles:
        - Circle size is proportional to the count of points in the ball
        - Circle color represents mean y value (red for low, green for high)
        - Lines connect landmark points with non-empty intersections

        Based on the algorithm by Rudkin and Dlotko (2024).

        Args:
            corpus: Corpus with 'tdabm' metadata. If None, uses self.corpus
            folder_path: Path to save the figure. If None, uses self.folder_path
            show: Whether to display the plot

        Returns:
            Matplotlib Figure object
        """
        if corpus is None:
            corpus = self.corpus

        if corpus is None:
            raise ValueError("No corpus provided")

        if "tdabm" not in corpus.metadata:
            raise ValueError(
                "Corpus metadata does not contain 'tdabm' data. Run TDABM analysis first."
            )

        tdabm_data = corpus.metadata["tdabm"]
        landmarks = tdabm_data["landmarks"]

        if not landmarks:
            raise ValueError("No landmarks found in TDABM data")

        # Create figure
        fig, ax = plt.subplots(figsize=(12, 10))

        # Collect all landmark locations
        locations = [landmark["location"] for landmark in landmarks]
        counts = [landmark["count"] for landmark in landmarks]
        mean_ys = [landmark["mean_y"] for landmark in landmarks]
        landmark_ids = [landmark["id"] for landmark in landmarks]

        # Perform PCA to reduce to 2 components (PC1, PC2)
        from sklearn.decomposition import PCA

        locations_array = np.array(locations)
        if locations_array.shape[1] < 2:
            # If only 1D, pad with zeros
            locations_array = np.pad(locations_array, ((0, 0), (0, 1)), mode="constant")
        pca = PCA(n_components=2)
        positions = pca.fit_transform(locations_array)

        # Normalize mean_y for color mapping (red=0, purple=max)
        min_y = min(mean_ys)
        max_y = max(mean_ys)

        if max_y - min_y > 0:
            normalized_ys = [(y - min_y) / (max_y - min_y) for y in mean_ys]
        else:
            normalized_ys = [0.5] * len(mean_ys)

        # Create color map: red (0) to green (1)
        colors = []
        for norm_y in normalized_ys:
            # Interpolate from red (1,0,0) to green (0,1,0)
            r = 1.0 - norm_y
            g = norm_y
            b = 0.0
            colors.append((r, g, b))

        # Draw connections first (so they appear behind circles)
        landmark_dict = {lm["id"]: idx for idx, lm in enumerate(landmarks)}

        for i, landmark in enumerate(landmarks):
            for connected_id in landmark["connections"]:
                if connected_id in landmark_dict:
                    j = landmark_dict[connected_id]
                    # Only draw each connection once (avoid duplicates)
                    if i < j:
                        ax.plot(
                            [positions[i, 0], positions[j, 0]],
                            [positions[i, 1], positions[j, 1]],
                            "k-",
                            alpha=0.3,
                            linewidth=1,
                            zorder=1,
                        )

        # Normalize counts for circle sizes (scale for visibility)
        max_count = max(counts)
        min_count = min(counts)

        if max_count > min_count:
            # Scale sizes between 100 and 2000
            sizes = [
                100 + 1900 * (c - min_count) / (max_count - min_count) for c in counts
            ]
        else:
            sizes = [500] * len(counts)

        # Draw circles for landmarks
        scatter = ax.scatter(
            positions[:, 0],
            positions[:, 1],
            s=sizes,
            c=colors,
            alpha=0.6,
            edgecolors="black",
            linewidths=1.5,
            zorder=2,
        )

        # Add count and mean_y as label inside each circle
        for i, (pos, count, mean_y) in enumerate(zip(positions, counts, mean_ys)):
            ax.annotate(
                f"{count}\n{mean_y:.2f}",
                xy=pos,
                xytext=(0, 0),
                textcoords="offset points",
                ha="center",
                va="center",
                fontsize=8,
                fontweight="bold",
                zorder=3,
            )

        # Set labels and title
        x_vars = tdabm_data.get("x_variables", [])
        y_var = tdabm_data.get("y_variable", "y")

        # Axis labels reflect PCA components
        ax.set_xlabel("PC1", fontsize=12)
        ax.set_ylabel("PC2", fontsize=12)

        ax.set_title(
            f"TDABM Visualization\n"
            f'Y variable: {y_var}, Radius: {tdabm_data.get("radius", 0.3)}\n'
            f"Landmarks: {len(landmarks)}",
            fontsize=14,
            fontweight="bold",
        )

        # Add colorbar for mean_y (red to green)
        sm = plt.cm.ScalarMappable(
            cmap=mcolors.LinearSegmentedColormap.from_list(
                "red_green", ["red", "green"]
            ),
            norm=mcolors.Normalize(vmin=min_y, vmax=max_y),
        )
        sm.set_array([])
        cbar = plt.colorbar(sm, ax=ax)
        cbar.set_label(f"Mean {y_var}", fontsize=12)

        # Add legend for circle sizes
        # Create dummy scatter plots for legend
        legend_counts = [min_count, (min_count + max_count) / 2, max_count]
        legend_sizes = []
        for c in legend_counts:
            if max_count > min_count:
                size = 100 + 1900 * (c - min_count) / (max_count - min_count)
            else:
                size = 500
            legend_sizes.append(size)

        legend_elements = []
        for size, count in zip(legend_sizes, legend_counts):
            legend_elements.append(
                plt.scatter(
                    [],
                    [],
                    s=size,
                    c="gray",
                    alpha=0.6,
                    edgecolors="black",
                    linewidths=1.5,
                    label=f"{int(count)} points",
                )
            )

        ax.legend(
            handles=legend_elements,
            title="Ball Size",
            loc="upper right",
            framealpha=0.9,
        )

        ax.grid(True, alpha=0.3)
        ax.set_aspect("equal", adjustable="box")

        plt.tight_layout()

        return self._finalize_plot(fig, folder_path, show)

    def draw_graph(
        self,
        corpus: Corpus | None = None,
        folder_path: str | None = None,
        show: bool = True,
        layout: str = "spring",
    ) -> Figure:
        """
        Draw graph visualization from corpus metadata.

        Creates a visualization of the graph structure showing documents,
        keywords, clusters, and metadata nodes along with their relationships.

        Args:
            corpus: Corpus with 'graph' metadata. If None, uses self.corpus
            folder_path: Path to save the figure. If None, uses self.folder_path
            show: Whether to display the plot
            layout: Graph layout algorithm ('spring', 'circular', 'kamada_kawai', 'spectral')

        Returns:
            Matplotlib Figure object

        Raises:
            ValueError: If corpus or graph metadata is missing
        """
        if corpus is None:
            corpus = self.corpus

        if corpus is None:
            raise ValueError("No corpus provided")

        if "graph" not in corpus.metadata:
            raise ValueError(
                "Corpus metadata does not contain 'graph' data. Run graph generation first."
            )

        graph_data = corpus.metadata["graph"]
        nodes = graph_data["nodes"]
        edges = graph_data["edges"]

        if not nodes:
            raise ValueError("No nodes found in graph data")

        # Create NetworkX graph
        G = nx.Graph()

        # Add nodes with their labels (store as maps keyed by node id)
        node_labels: dict[str, str] = {}
        node_color_map_by_id: dict[str, str] = {}
        node_size_map_by_id: dict[str, float] = {}

        # Color mapping for different node types
        color_map = {
            "document": "#FF6B6B",  # Red
            "keyword": "#4ECDC4",  # Teal
            "cluster": "#95E1D3",  # Light green
            "metadata": "#FFD93D",  # Yellow
        }

        for node in nodes:
            node_id = str(node.get("id"))
            label = node.get("label", "metadata")
            properties = node.get("properties", {})

            G.add_node(node_id, label=label, **properties)

            # Set node label (use name property if available)
            if "name" in properties and properties["name"]:
                node_labels[node_id] = str(properties["name"])
            else:
                # For keywords, remove the "keyword:" prefix
                if node_id.startswith("keyword:"):
                    node_labels[node_id] = node_id.replace("keyword:", "")
                elif node_id.startswith("cluster:"):
                    node_labels[node_id] = f"C{node_id.replace('cluster:', '')}"
                elif node_id.startswith("metadata:"):
                    node_labels[node_id] = "M"
                else:
                    node_labels[node_id] = node_id

            # Set node color based on type
            node_color_map_by_id[node_id] = color_map.get(label, "#CCCCCC")

            # Set node size based on type (documents larger)
            if label == "document":
                node_size_map_by_id[node_id] = 800.0
            elif label == "keyword":
                node_size_map_by_id[node_id] = 500.0
            elif label == "cluster":
                node_size_map_by_id[node_id] = 600.0
            else:
                node_size_map_by_id[node_id] = 400.0

        # Add edges
        for edge in edges:
            source = str(edge.get("source"))
            target = str(edge.get("target"))
            # If edge introduces unknown nodes, add with default properties
            if source not in G:
                G.add_node(source, label="metadata")
                node_labels[source] = (
                    source if not source.startswith("metadata:") else "M"
                )
                node_color_map_by_id[source] = color_map.get("metadata", "#CCCCCC")
                node_size_map_by_id[source] = 400.0
            if target not in G:
                G.add_node(target, label="metadata")
                node_labels[target] = (
                    target if not target.startswith("metadata:") else "M"
                )
                node_color_map_by_id[target] = color_map.get("metadata", "#CCCCCC")
                node_size_map_by_id[target] = 400.0
            G.add_edge(source, target)

        # Create figure
        fig, ax = plt.subplots(figsize=(16, 12))

        # Choose layout algorithm
        if layout == "spring":
            pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
        elif layout == "circular":
            pos = nx.circular_layout(G)
        elif layout == "kamada_kawai":
            pos = nx.kamada_kawai_layout(G)
        elif layout == "spectral":
            pos = nx.spectral_layout(G)
        else:
            pos = nx.spring_layout(G, seed=42)

        # Draw edges first (so they appear behind nodes)
        nx.draw_networkx_edges(
            G,
            pos,
            ax=ax,
            edge_color="#CCCCCC",
            width=1.5,
            alpha=0.6,
        )

        # Build aligned arrays for node attributes
        nodelist = list(G.nodes())
        node_colors = [node_color_map_by_id.get(n, "#CCCCCC") for n in nodelist]
        node_sizes = np.asarray(
            [float(node_size_map_by_id.get(n, 400.0)) for n in nodelist]
        )
        # Draw nodes with explicit nodelist
        nx.draw_networkx_nodes(
            G,
            pos,
            nodelist=nodelist,
            ax=ax,
            node_color=node_colors,
            node_size=node_sizes,
            alpha=0.9,
            edgecolors="black",
            linewidths=1.5,
        )

        # Draw labels
        # Labels aligned to nodelist
        labels_ordered = {n: node_labels.get(n, str(n)) for n in nodelist}
        nx.draw_networkx_labels(
            G,
            pos,
            labels=labels_ordered,
            ax=ax,
            font_size=8,
            font_weight="bold",
            font_color="black",
        )

        # Add title and legend
        # Compute stats if not provided
        num_nodes = int(graph_data.get("num_nodes", len(G.nodes())))
        num_edges = int(graph_data.get("num_edges", len(G.edges())))
        num_documents = int(
            graph_data.get(
                "num_documents",
                sum(1 for _, d in G.nodes(data=True) if d.get("label") == "document"),
            )
        )
        ax.set_title(
            f"Graph Visualization\n"
            f"Nodes: {num_nodes}, "
            f"Edges: {num_edges}, "
            f"Documents: {num_documents}",
            fontsize=14,
            fontweight="bold",
            pad=20,
        )

        # Create legend
        from matplotlib.patches import Patch

        legend_elements = []
        for node_type, color in color_map.items():
            legend_elements.append(
                Patch(facecolor=color, edgecolor="black", label=node_type.capitalize())
            )

        ax.legend(
            handles=legend_elements,
            loc="upper left",
            framealpha=0.9,
            title="Node Types",
        )

        ax.axis("off")
        plt.tight_layout()

        return self._finalize_plot(fig, folder_path, show)
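
A minimal usage sketch, assuming the import path follows the file layout (src/crisp_t/visualize.py). Passing a DataFrame explicitly avoids relying on corpus.visualization, so no Corpus is needed here:

import pandas as pd

from crisp_t.visualize import QRVisualize  # import path assumed from the file layout

viz = QRVisualize()  # no corpus required when a DataFrame is supplied
df = pd.DataFrame(
    {
        "term": ["care", "nurse", "ward", "shift"],
        "frequency": [42, 31, 18, 12],
    }
)
fig, ax = viz.plot_top_terms(df=df, top_n=4, folder_path="top_terms.png", show=False)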

draw_graph(corpus=None, folder_path=None, show=True, layout='spring')

Draw graph visualization from corpus metadata.

Creates a visualization of the graph structure showing documents, keywords, clusters, and metadata nodes along with their relationships.

Parameters:

Name Type Description Default
corpus Corpus | None

Corpus with 'graph' metadata. If None, uses self.corpus

None
folder_path str | None

Path to save the figure. If None, uses self.folder_path

None
show bool

Whether to display the plot

True
layout str

Graph layout algorithm ('spring', 'circular', 'kamada_kawai', 'spectral')

'spring'

Returns:

Type Description
Figure

Matplotlib Figure object

Raises:

Type Description
ValueError

If corpus or graph metadata is missing

Source code in src/crisp_t/visualize.py
def draw_graph(
    self,
    corpus: Corpus | None = None,
    folder_path: str | None = None,
    show: bool = True,
    layout: str = "spring",
) -> Figure:
    """
    Draw graph visualization from corpus metadata.

    Creates a visualization of the graph structure showing documents,
    keywords, clusters, and metadata nodes along with their relationships.

    Args:
        corpus: Corpus with 'graph' metadata. If None, uses self.corpus
        folder_path: Path to save the figure. If None, uses self.folder_path
        show: Whether to display the plot
        layout: Graph layout algorithm ('spring', 'circular', 'kamada_kawai', 'spectral')

    Returns:
        Matplotlib Figure object

    Raises:
        ValueError: If corpus or graph metadata is missing
    """
    if corpus is None:
        corpus = self.corpus

    if corpus is None:
        raise ValueError("No corpus provided")

    if "graph" not in corpus.metadata:
        raise ValueError(
            "Corpus metadata does not contain 'graph' data. Run graph generation first."
        )

    graph_data = corpus.metadata["graph"]
    nodes = graph_data["nodes"]
    edges = graph_data["edges"]

    if not nodes:
        raise ValueError("No nodes found in graph data")

    # Create NetworkX graph
    G = nx.Graph()

    # Add nodes with their labels (store as maps keyed by node id)
    node_labels: dict[str, str] = {}
    node_color_map_by_id: dict[str, str] = {}
    node_size_map_by_id: dict[str, float] = {}

    # Color mapping for different node types
    color_map = {
        "document": "#FF6B6B",  # Red
        "keyword": "#4ECDC4",  # Teal
        "cluster": "#95E1D3",  # Light green
        "metadata": "#FFD93D",  # Yellow
    }

    for node in nodes:
        node_id = str(node.get("id"))
        label = node.get("label", "metadata")
        properties = node.get("properties", {})

        G.add_node(node_id, label=label, **properties)

        # Set node label (use name property if available)
        if "name" in properties and properties["name"]:
            node_labels[node_id] = str(properties["name"])
        else:
            # For keywords, remove the "keyword:" prefix
            if node_id.startswith("keyword:"):
                node_labels[node_id] = node_id.replace("keyword:", "")
            elif node_id.startswith("cluster:"):
                node_labels[node_id] = f"C{node_id.replace('cluster:', '')}"
            elif node_id.startswith("metadata:"):
                node_labels[node_id] = "M"
            else:
                node_labels[node_id] = node_id

        # Set node color based on type
        node_color_map_by_id[node_id] = color_map.get(label, "#CCCCCC")

        # Set node size based on type (documents larger)
        if label == "document":
            node_size_map_by_id[node_id] = 800.0
        elif label == "keyword":
            node_size_map_by_id[node_id] = 500.0
        elif label == "cluster":
            node_size_map_by_id[node_id] = 600.0
        else:
            node_size_map_by_id[node_id] = 400.0

    # Add edges
    for edge in edges:
        source = str(edge.get("source"))
        target = str(edge.get("target"))
        # If edge introduces unknown nodes, add with default properties
        if source not in G:
            G.add_node(source, label="metadata")
            node_labels[source] = (
                source if not source.startswith("metadata:") else "M"
            )
            node_color_map_by_id[source] = color_map.get("metadata", "#CCCCCC")
            node_size_map_by_id[source] = 400.0
        if target not in G:
            G.add_node(target, label="metadata")
            node_labels[target] = (
                target if not target.startswith("metadata:") else "M"
            )
            node_color_map_by_id[target] = color_map.get("metadata", "#CCCCCC")
            node_size_map_by_id[target] = 400.0
        G.add_edge(source, target)

    # Create figure
    fig, ax = plt.subplots(figsize=(16, 12))

    # Choose layout algorithm
    if layout == "spring":
        pos = nx.spring_layout(G, k=2, iterations=50, seed=42)
    elif layout == "circular":
        pos = nx.circular_layout(G)
    elif layout == "kamada_kawai":
        pos = nx.kamada_kawai_layout(G)
    elif layout == "spectral":
        pos = nx.spectral_layout(G)
    else:
        pos = nx.spring_layout(G, seed=42)

    # Draw edges first (so they appear behind nodes)
    nx.draw_networkx_edges(
        G,
        pos,
        ax=ax,
        edge_color="#CCCCCC",
        width=1.5,
        alpha=0.6,
    )

    # Build aligned arrays for node attributes
    nodelist = list(G.nodes())
    node_colors = [node_color_map_by_id.get(n, "#CCCCCC") for n in nodelist]
    node_sizes = np.asarray(
        [float(node_size_map_by_id.get(n, 400.0)) for n in nodelist]
    )
    # Draw nodes with explicit nodelist
    nx.draw_networkx_nodes(
        G,
        pos,
        nodelist=nodelist,
        ax=ax,
        node_color=node_colors,
        node_size=node_sizes,
        alpha=0.9,
        edgecolors="black",
        linewidths=1.5,
    )

    # Draw labels
    # Labels aligned to nodelist
    labels_ordered = {n: node_labels.get(n, str(n)) for n in nodelist}
    nx.draw_networkx_labels(
        G,
        pos,
        labels=labels_ordered,
        ax=ax,
        font_size=8,
        font_weight="bold",
        font_color="black",
    )

    # Add title and legend
    # Compute stats if not provided
    num_nodes = int(graph_data.get("num_nodes", len(G.nodes())))
    num_edges = int(graph_data.get("num_edges", len(G.edges())))
    num_documents = int(
        graph_data.get(
            "num_documents",
            sum(1 for _, d in G.nodes(data=True) if d.get("label") == "document"),
        )
    )
    ax.set_title(
        f"Graph Visualization\n"
        f"Nodes: {num_nodes}, "
        f"Edges: {num_edges}, "
        f"Documents: {num_documents}",
        fontsize=14,
        fontweight="bold",
        pad=20,
    )

    # Create legend
    from matplotlib.patches import Patch

    legend_elements = []
    for node_type, color in color_map.items():
        legend_elements.append(
            Patch(facecolor=color, edgecolor="black", label=node_type.capitalize())
        )

    ax.legend(
        handles=legend_elements,
        loc="upper left",
        framealpha=0.9,
        title="Node Types",
    )

    ax.axis("off")
    plt.tight_layout()

    return self._finalize_plot(fig, folder_path, show)
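
For orientation, here is a minimal sketch of the 'graph' entry in corpus.metadata that the method above consumes. The shape is inferred from the reads in the code (node 'id', 'label', and 'properties'; edge 'source' and 'target'; optional num_* summary keys); the concrete ids and values are illustrative only, not taken from the library.

graph_metadata_example = {
    "nodes": [
        {"id": "doc1", "label": "document", "properties": {"name": "Interview 1"}},
        {"id": "keyword:care", "label": "keyword", "properties": {}},
        {"id": "cluster:0", "label": "cluster", "properties": {}},
    ],
    "edges": [
        {"source": "doc1", "target": "keyword:care"},
        {"source": "doc1", "target": "cluster:0"},
    ],
    # Optional summary keys; if absent, the method recomputes them from the graph.
    "num_nodes": 3,
    "num_edges": 2,
    "num_documents": 1,
}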

draw_tdabm(corpus=None, folder_path=None, show=True)

Draw TDABM (Topological Data Analysis Ball Mapper) visualization.

Creates a 2D graph showing landmark points as circles:
- Circle size is proportional to the count of points in the ball
- Circle color represents the mean y value (red for low, green for high)
- Lines connect landmark points whose balls have a non-empty intersection

Based on the algorithm by Rudkin and Dlotko (2024).

Parameters:

    corpus (Corpus | None, default None): Corpus with 'tdabm' metadata. If None, uses self.corpus.
    folder_path (str | None, default None): Path to save the figure. If None, uses self.folder_path.
    show (bool, default True): Whether to display the plot.

Returns:

    Figure: Matplotlib Figure object.

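A brief usage sketch, assuming `viz` is an instance of the visualizer class defined in src/crisp_t/visualize.py (its constructor is not shown on this page) and `corpus` already carries a 'tdabm' metadata entry; the folder path is illustrative.

# `viz` and the output path are placeholders; the call signature matches the docs above.
fig = viz.draw_tdabm(corpus=corpus, folder_path="output/figures", show=False)
fig.savefig("output/figures/tdabm.png", dpi=300)  # plain Matplotlib save; _finalize_plot may also write a copy
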
Source code in src/crisp_t/visualize.py (lines 802-1001)
def draw_tdabm(
    self,
    corpus: Corpus | None = None,
    folder_path: str | None = None,
    show: bool = True,
) -> Figure:
    """
    Draw TDABM (Topological Data Analysis Ball Mapper) visualization.

    Creates a 2D graph showing landmark points as circles:
    - Circle size is proportional to the count of points in the ball
    - Circle color represents mean y value (red for low, purple for high)
    - Lines connect landmark points with non-empty intersections

    Based on the algorithm by Rudkin and Dlotko (2024).

    Args:
        corpus: Corpus with 'tdabm' metadata. If None, uses self.corpus
        folder_path: Path to save the figure. If None, uses self.folder_path
        show: Whether to display the plot

    Returns:
        Matplotlib Figure object
    """
    if corpus is None:
        corpus = self.corpus

    if corpus is None:
        raise ValueError("No corpus provided")

    if "tdabm" not in corpus.metadata:
        raise ValueError(
            "Corpus metadata does not contain 'tdabm' data. Run TDABM analysis first."
        )

    tdabm_data = corpus.metadata["tdabm"]
    landmarks = tdabm_data["landmarks"]

    if not landmarks:
        raise ValueError("No landmarks found in TDABM data")

    # Create figure
    fig, ax = plt.subplots(figsize=(12, 10))

    # Collect all landmark locations
    locations = [landmark["location"] for landmark in landmarks]
    counts = [landmark["count"] for landmark in landmarks]
    mean_ys = [landmark["mean_y"] for landmark in landmarks]
    landmark_ids = [landmark["id"] for landmark in landmarks]

    # Perform PCA to reduce to 2 components (PC1, PC2)
    from sklearn.decomposition import PCA

    locations_array = np.array(locations)
    if locations_array.shape[1] < 2:
        # If only 1D, pad with zeros
        locations_array = np.pad(locations_array, ((0, 0), (0, 1)), mode="constant")
    pca = PCA(n_components=2)
    positions = pca.fit_transform(locations_array)

    # Normalize mean_y for color mapping (red=0, purple=max)
    min_y = min(mean_ys)
    max_y = max(mean_ys)

    if max_y - min_y > 0:
        normalized_ys = [(y - min_y) / (max_y - min_y) for y in mean_ys]
    else:
        normalized_ys = [0.5] * len(mean_ys)

    # Create color map: red (0) to green (1)
    colors = []
    for norm_y in normalized_ys:
        # Interpolate from red (1,0,0) to green (0,1,0)
        r = 1.0 - norm_y
        g = norm_y
        b = 0.0
        colors.append((r, g, b))

    # Draw connections first (so they appear behind circles)
    landmark_dict = {lm["id"]: idx for idx, lm in enumerate(landmarks)}

    for i, landmark in enumerate(landmarks):
        for connected_id in landmark["connections"]:
            if connected_id in landmark_dict:
                j = landmark_dict[connected_id]
                # Only draw each connection once (avoid duplicates)
                if i < j:
                    ax.plot(
                        [positions[i, 0], positions[j, 0]],
                        [positions[i, 1], positions[j, 1]],
                        "k-",
                        alpha=0.3,
                        linewidth=1,
                        zorder=1,
                    )

    # Normalize counts for circle sizes (scale for visibility)
    max_count = max(counts)
    min_count = min(counts)

    if max_count > min_count:
        # Scale sizes between 100 and 2000
        sizes = [
            100 + 1900 * (c - min_count) / (max_count - min_count) for c in counts
        ]
    else:
        sizes = [500] * len(counts)

    # Draw circles for landmarks
    scatter = ax.scatter(
        positions[:, 0],
        positions[:, 1],
        s=sizes,
        c=colors,
        alpha=0.6,
        edgecolors="black",
        linewidths=1.5,
        zorder=2,
    )

    # Add count and mean_y as label inside each circle
    for i, (pos, count, mean_y) in enumerate(zip(positions, counts, mean_ys)):
        ax.annotate(
            f"{count}\n{mean_y:.2f}",
            xy=pos,
            xytext=(0, 0),
            textcoords="offset points",
            ha="center",
            va="center",
            fontsize=8,
            fontweight="bold",
            zorder=3,
        )

    # Set labels and title
    x_vars = tdabm_data.get("x_variables", [])
    y_var = tdabm_data.get("y_variable", "y")

    # Axis labels reflect PCA components
    ax.set_xlabel("PC1", fontsize=12)
    ax.set_ylabel("PC2", fontsize=12)

    ax.set_title(
        f"TDABM Visualization\n"
        f'Y variable: {y_var}, Radius: {tdabm_data.get("radius", 0.3)}\n'
        f"Landmarks: {len(landmarks)}",
        fontsize=14,
        fontweight="bold",
    )

    # Add colorbar for mean_y (red to green)
    sm = plt.cm.ScalarMappable(
        cmap=mcolors.LinearSegmentedColormap.from_list(
            "red_green", ["red", "green"]
        ),
        norm=mcolors.Normalize(vmin=min_y, vmax=max_y),
    )
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=ax)
    cbar.set_label(f"Mean {y_var}", fontsize=12)

    # Add legend for circle sizes
    # Create dummy scatter plots for legend
    legend_counts = [min_count, (min_count + max_count) / 2, max_count]
    legend_sizes = []
    for c in legend_counts:
        if max_count > min_count:
            size = 100 + 1900 * (c - min_count) / (max_count - min_count)
        else:
            size = 500
        legend_sizes.append(size)

    legend_elements = []
    for size, count in zip(legend_sizes, legend_counts):
        legend_elements.append(
            plt.scatter(
                [],
                [],
                s=size,
                c="gray",
                alpha=0.6,
                edgecolors="black",
                linewidths=1.5,
                label=f"{int(count)} points",
            )
        )

    ax.legend(
        handles=legend_elements,
        title="Ball Size",
        loc="upper right",
        framealpha=0.9,
    )

    ax.grid(True, alpha=0.3)
    ax.set_aspect("equal", adjustable="box")

    plt.tight_layout()

    return self._finalize_plot(fig, folder_path, show)
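
For reference, a minimal sketch of the 'tdabm' metadata structure this method reads, inferred from the accesses above (landmark 'id', 'location', 'count', 'mean_y', 'connections'; top-level 'x_variables', 'y_variable', 'radius'); the values and variable names are illustrative only.

tdabm_metadata_example = {
    "landmarks": [
        {
            "id": 0,
            "location": [0.12, 0.85, 0.33],  # coordinates in the x-variable space
            "count": 14,                     # number of points covered by this ball
            "mean_y": 0.42,                  # mean of the y variable within the ball
            "connections": [1],              # ids of landmarks with overlapping balls
        },
        {
            "id": 1,
            "location": [0.30, 0.70, 0.41],
            "count": 9,
            "mean_y": 0.61,
            "connections": [0],
        },
    ],
    "x_variables": ["sentiment", "word_count", "topic_0"],  # illustrative names
    "y_variable": "outcome",
    "radius": 0.3,
}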

get_lda_viz(lda_model, corpus_bow, dictionary, folder_path=None, mds='tsne', lambda_val=0.6, show=True)

Generate an interactive LDA visualization using pyLDAvis.

Parameters:

    lda_model (required): The trained LDA model.
    corpus_bow (required): Bag-of-words corpus.
    dictionary (required): Gensim dictionary.
    folder_path (str | None, default None): Path to save the HTML visualization.
    mds (str, default 'tsne'): Dimension reduction method ('tsne', 'mmds', or 'pcoa').
    lambda_val (float, default 0.6): Lambda parameter for the relevance metric; Mettler et al. (2025) found λ = 0.6 to be optimal across several experiments.
    show (bool, default True): Whether to display the visualization.

Returns:

    str | None: HTML string of the visualization if successful, None otherwise.

Raises:

    ImportError: If pyLDAvis is not installed.
    ValueError: If required inputs are missing.

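A hedged usage sketch follows. The gensim preprocessing is standard library usage rather than code taken from CRISP-T, `viz` again stands for an instance of the visualizer class in src/crisp_t/visualize.py, and the output path is illustrative; pyLDAvis must be installed for the call to succeed.

from gensim import corpora
from gensim.models import LdaModel

# Toy documents, tokenized; real corpora would come from the CRISP-T pipeline.
tokenized_docs = [doc.split() for doc in ["patient care access", "care quality cost"]]
dictionary = corpora.Dictionary(tokenized_docs)
corpus_bow = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]
lda_model = LdaModel(corpus=corpus_bow, id2word=dictionary, num_topics=2, passes=5, random_state=42)

html = viz.get_lda_viz(
    lda_model,
    corpus_bow,
    dictionary,
    folder_path="output/lda_vis.html",  # saved as an HTML file
    mds="pcoa",
)
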
Source code in src/crisp_t/visualize.py (lines 731-800)
def get_lda_viz(
    self,
    lda_model,
    corpus_bow,
    dictionary,
    folder_path: str | None = None,
    mds: str = "tsne",
    lambda_val: float = 0.6,
    show: bool = True,
) -> str | None:
    """
    Generate an interactive LDA visualization using pyLDAvis.

    Args:
        lda_model: The trained LDA model
        corpus_bow: Bag of words corpus
        dictionary: Gensim dictionary
        folder_path: Path to save the HTML visualization
        mds: Dimension reduction method ('tsne', 'mmds', or 'pcoa')
        lambda_val: Lambda parameter for relevance metric (default: 0.6).
                   Mettler et al. (2025) performed several experiments to identify
                   the optimal value of λ, which turned out to be 0.6.
        show: Whether to display the visualization

    Returns:
        HTML string of the visualization if successful, None otherwise

    Raises:
        ImportError: If pyLDAvis is not installed
        ValueError: If required inputs are missing
    """
    if not PYLDAVIS_AVAILABLE:
        raise ImportError(
            "pyLDAvis is not installed. Install it with: pip install pyLDAvis"
        )

    if lda_model is None:
        raise ValueError("LDA model is required")
    if corpus_bow is None:
        raise ValueError("Corpus bag of words is required")
    if dictionary is None:
        raise ValueError("Dictionary is required")

    try:
        # Prepare the visualization data
        vis_data = gensimvis.prepare(
            lda_model,
            corpus_bow,
            dictionary,
            mds=mds,
            R=30,
            lambda_step=0.01,
            plot_opts={"xlab": "PC1", "ylab": "PC2"},
        )

        # Save to HTML file if path provided
        if folder_path:
            output_path = Path(folder_path)
            if output_path.parent:
                output_path.parent.mkdir(parents=True, exist_ok=True)
            pyLDAvis.save_html(vis_data, str(output_path))
            logger.info(f"LDA visualization saved to {output_path}")

        # Return HTML string for embedding or further use
        html_string = pyLDAvis.prepared_data_to_html(vis_data)
        return html_string

    except Exception as e:
        logger.error(f"Error generating LDA visualization: {e}")
        raise