TornadoExample.java

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package tech.tablesaw.examples;

import static tech.tablesaw.aggregate.AggregateFunctions.countNonMissing;
import static tech.tablesaw.aggregate.AggregateFunctions.mean;
import static tech.tablesaw.aggregate.AggregateFunctions.median;
import static tech.tablesaw.aggregate.AggregateFunctions.min;
import static tech.tablesaw.aggregate.AggregateFunctions.sum;

import tech.tablesaw.api.DateColumn;
import tech.tablesaw.api.IntColumn;
import tech.tablesaw.api.StringColumn;
import tech.tablesaw.api.Table;
import tech.tablesaw.selection.Selection;

/** Usage example using a Tornado data set */
public class TornadoExample extends AbstractExample {

  public static void main(String[] args) throws Exception {

    Table tornadoes = Table.read().csv("../data/tornadoes_1950-2014.csv");
    assert (tornadoes != null);

    Table structure = tornadoes.structure();
    out(structure);
    out(structure.where(structure.stringColumn("Column Type").isEqualTo("DOUBLE")));

    tornadoes.setName("tornadoes");

    out();
    out("Column names");
    out(tornadoes.columnNames());

    out();
    out("Remove the 'State No' column");
    tornadoes.removeColumns("State No");
    out(tornadoes.columnNames());

    out();
    out("print the table's shape:");
    out(tornadoes.shape());

    out();
    out("Use first(3) to view the first 3 rows:");
    out(tornadoes.first(3));

    tornadoes = tornadoes.where(tornadoes.numberColumn("Start Lat").isGreaterThan(20f));

    out();
    out("Extract month from the date and make it a separate column");
    StringColumn month = tornadoes.dateColumn("Date").month();
    out(month.summary());

    out("Add the month column to the table");
    tornadoes.insertColumn(2, month);
    out(tornadoes.columnNames());

    out();
    out("Filtering: Tornadoes where there were fatalities");
    Table fatal = tornadoes.where(tornadoes.numberColumn("Fatalities").isGreaterThan(0));
    out(fatal.shape());

    out();
    out(fatal.first(5));

    out();
    out("Total fatalities: " + fatal.numberColumn("Fatalities").sum());

    out();
    out("Sorting on Fatalities in descending order");
    fatal = fatal.sortDescendingOn("Fatalities");
    out(fatal.first(5));

    out("");
    out("Calculating basic descriptive statistics on Fatalities");
    out(fatal.numberColumn("Fatalities").summary());

    // TODO(lwhite): Provide a param for title of the new table (or auto-generate a better one).
    Table injuriesByScale = tornadoes.summarize("Injuries", median).by("Scale");
    Table fob = tornadoes.summarize("Injuries", min).by("Scale", "State");
    out(fob);
    injuriesByScale.setName("Median injuries by Tornado Scale");
    out(injuriesByScale);

    injuriesByScale = tornadoes.summarize("Injuries", mean).by("Scale");
    injuriesByScale.setName("Average injuries by Tornado Scale");
    out(injuriesByScale);

    // TODO(lwhite): Provide a param for title of the new table (or auto-generate a better one).
    Table injuriesByScaleState = tornadoes.summarize("Injuries", median).by("Scale", "State");
    injuriesByScaleState.setName("Median injuries by Tornado Scale and State");
    out(injuriesByScaleState);

    Table injuriesByScaleState2 = tornadoes.summarize("Injuries", sum).by("State", "Scale");
    injuriesByScaleState2.setName("Total injuries by Tornado Scale and State");
    out(injuriesByScaleState2);

    // Average days between tornadoes in the summer

    // alternate, somewhat more precise approach
    DateColumn date = tornadoes.dateColumn("Date");

    Selection summerFilter =
        date.month()
            .isIn("JULY", "AUGUST")
            .or(date.month().isEqualTo("JUNE").and(date.dayOfMonth().isGreaterThanOrEqualTo(21)))
            .or(date.month().isEqualTo("SEPTEMBER").and(date.dayOfMonth().isLessThanOrEqualTo(22)));

    // Table summer = tornadoes.select(selection);
    Table summer = tornadoes.where(summerFilter);
    summer = summer.sortAscendingOn("Date", "Time");
    summer.addColumns(summer.dateColumn("Date").lag(1));

    // calculate the difference between a date and the prior date using the lagged column
    DateColumn summerDate = summer.dateColumn("Date");
    DateColumn laggedDate = summer.dateColumn("Date lag(1)");
    IntColumn delta = laggedDate.daysUntil(summerDate); // the lagged date is earlier
    summer.addColumns(delta);

    out(summer.first(4));

    // now we can summarize by year so we don't inadvertently include differences between multiple
    // years
    Table summary = summer.summarize(delta, mean, countNonMissing).by(summerDate.year());
    out(summary);

    // taking the mean of the annual means gives us an approximate answer
    // we could also use the count value calculated above to get a weighted average
    out(summary.nCol(1).mean());

    out();

    out("Writing the revised table to a new csv file");
    tornadoes.write().csv("../data/rev_tornadoes_1950-2014.csv");
  }
}