// newgtlds is a utility command that downloads the list of gTLDs from ICANN // and formats it into the PSL format, writing to stdout. package tools import ( "bytes" "encoding/json" "errors" "flag" "fmt" "io" "net/http" "net/url" "os" "strings" "text/template" "time" ) const ( // ICANN_GTLD_JSON_URL is the URL for the ICANN gTLD JSON registry (version // 2). See https://www.icann.org/resources/pages/registries/registries-en for // more information. ICANN_GTLD_JSON_URL = "https://www.icann.org/resources/registries/gtlds/v2/gtlds.json" // IANA_TLDS_TXT_URL is the URL for the IANA "Public Suffix List" of TLDs // in the ICP-3 Root - including new ccTLDs, EBRERO gTLDS or things not in // the JSON File above that should be included in the PSL. Note: UPPERCASE IANA_TLDS_TXT_URL = "http://data.iana.org/TLD/tlds-alpha-by-domain.txt" // IANA_TLD_URL_BASE is the base URL for IANA domain information pages. IANA_TLD_URL_BASE = "https://www.iana.org/domains/root/db" // PSL_GTLDS_SECTION_HEADER marks the start of the newGTLDs section of the // overall public suffix dat file. PSL_GTLDS_SECTION_HEADER = "// newGTLDs" // PSL_GTLDS_SECTION_FOOTER marks the end of the newGTLDs section of the // overall public suffix dat file. PSL_GTLDS_SECTION_FOOTER = "// ===END ICANN DOMAINS===" ) var ( // legacyGTLDs are gTLDs that predate ICANN's new gTLD program. These legacy // gTLDs are present in the ICANN_GTLD_JSON_URL data but we do not want to // include them in the new gTLD section of the PSL data because it will create // duplicates with existing entries alongside registry-reserved second level // domains present in the PSL data. Entries present in legacyGTLDs will not be // output by this tool when generating the new gTLD data. legacyGTLDs = map[string]bool{ "aero": true, "asia": true, "biz": true, "cat": true, "com": true, "coop": true, "info": true, "jobs": true, "mobi": true, "museum": true, "name": true, "net": true, "org": true, "post": true, "pro": true, "tel": true, "xxx": true, } // pslHeaderTemplate is a parsed text/template instance for rendering the header // before the data rendered with the pslTemplate. We use two separate templates // so that we can avoid having a variable date stamp in the pslTemplate, allowing // us to easily check that the data in the current .dat file is unchanged from // what we render when there are no updates to add. // // Expected template data: // URL - the string URL that the data was fetched from. // Date - the time.Date that the data was fetched. // DateFormat - the format string to use with the date. pslHeaderTemplate = template.Must(template.New("public-suffix-list-gtlds-header").Parse(` // List of new gTLDs imported from {{ .URL }} on {{ .Date.Format .DateFormat }} // This list is auto-generated, don't edit it manually.`)) // pslTemplate is a parsed text/template instance for rendering a list of pslEntry // objects in the format used by the public suffix list. // // It expects the following template data: // Entries - a list of pslEntry objects. pslTemplate = template.Must( template.New("public-suffix-list-gtlds").Parse(` {{- range .Entries }} {{- .Comment }} {{ printf "%s\n" .ULabel }} {{ end }}`)) ) // pslEntry is a struct matching a subset of the gTLD data fields present in // each object entry of the "GLTDs" array from ICANN_GTLD_JSON_URL. type pslEntry struct { // ALabel contains the ASCII gTLD name. For internationalized gTLDs the GTLD // field is expressed in punycode. ALabel string `json:"gTLD"` // DelegationDate holds the date the gTLD was delegated to the root zone. // A TLD should be considered dead if the delegation date is empty. DelegationDate string // ULabel contains the unicode representation of the gTLD name. When the gTLD // ULabel in the ICANN gTLD data is empty (e.g for an ASCII gTLD like // '.pizza') the PSL entry will use the ALabel as the ULabel. ULabel string // RegistryOperator holds the name of the registry operator that operates the // gTLD (may be empty). RegistryOperator string // ContractTerminated indicates whether the contract has been terminated by // ICANN. When rendered by the pslTemplate only entries with // ContractTerminated = false are included. ContractTerminated bool // RemovalDate indicates the date the gTLD delegation was removed from the // root zones. RemovalDate string } // normalize will normalize a pslEntry by mutating it in place to trim the // string fields of whitespace and by populating the ULabel with the ALabel if // the ULabel is empty. func (e *pslEntry) normalize() { e.ALabel = strings.TrimSpace(e.ALabel) e.ULabel = strings.TrimSpace(e.ULabel) e.RegistryOperator = strings.TrimSpace(e.RegistryOperator) // If there is no explicit uLabel use the gTLD as the uLabel. if e.ULabel == "" { e.ULabel = e.ALabel } } // Comment generates a comment string for the pslEntry. This string has a `//` // prefix and matches one of the following two forms. // // If the registry operator field is empty the comment will be of the form: // // // <ALabel> // // https://www.iana.org/domains/root/db/<ALabel>.html // // If the registry operator field is not empty the comment will be of the form: // // // <ALabel> : <RegistryOperator> // // https://www.iana.org/domains/root/db/<ALabel>.html func (e pslEntry) Comment() string { parts := []string{ "//", e.ALabel, } // Avoid two trailing spaces if registry operator is empty if e.RegistryOperator != "" { parts = append(parts, []string{":", e.RegistryOperator}...) } ianaUrl, err := url.JoinPath(IANA_TLD_URL_BASE, e.ALabel+".html") if err != nil { panic(fmt.Sprintf("invalid joined IANA TLD URL for %q: %v", e.ALabel, err)) } ianaUrl = "// " + ianaUrl return strings.Join([]string{ strings.Join(parts, " "), ianaUrl, }, "\n") } // gTLDDatSpan represents the span between the PSL_GTLD_SECTION_HEADER and // the PSL_GTLDS_SECTION_FOOTER in the PSL dat file. type gTLDDatSpan struct { startIndex int endIndex int } var ( errNoHeader = fmt.Errorf("did not find expected header line %q", PSL_GTLDS_SECTION_HEADER) errMultipleHeaders = fmt.Errorf("found expected header line %q more than once", PSL_GTLDS_SECTION_HEADER) errNoFooter = fmt.Errorf("did not find expected footer line %q", PSL_GTLDS_SECTION_FOOTER) ) type errInvertedSpan struct { span gTLDDatSpan } func (e errInvertedSpan) Error() string { return fmt.Sprintf( "found footer line %q before header line %q (index %d vs %d)", PSL_GTLDS_SECTION_FOOTER, PSL_GTLDS_SECTION_HEADER, e.span.endIndex, e.span.startIndex) } // validate checks that a given gTLDDatSpan is sensible. It returns an err if // the span is nil, if the start or end index haven't been set to > 0, or if the // end index is <= the the start index. func (s gTLDDatSpan) validate() error { if s.startIndex <= 0 { return errNoHeader } if s.endIndex <= 0 { return errNoFooter } if s.endIndex <= s.startIndex { return errInvertedSpan{span: s} } return nil } // datFile holds the individual lines read from the public suffix list dat file and // the span that holds the gTLD specific data section. It supports reading the // gTLD specific data, and replacing it. type datFile struct { // lines holds the datfile contents split by "\n" lines []string // gTLDSpan holds the indexes where the gTLD data can be found in lines. gTLDSpan gTLDDatSpan } type errSpanOutOfBounds struct { span gTLDDatSpan numLines int } func (e errSpanOutOfBounds) Error() string { return fmt.Sprintf( "span out of bounds: start index %d, end index %d, number of lines %d", e.span.startIndex, e.span.endIndex, e.numLines) } // validate validates the state of the datFile. It returns an error if // the gTLD span validate() returns an error, or if gTLD span endIndex is >= the // number of lines in the file. func (d datFile) validate() error { if err := d.gTLDSpan.validate(); err != nil { return err } if d.gTLDSpan.endIndex >= len(d.lines) { return errSpanOutOfBounds{span: d.gTLDSpan, numLines: len(d.lines)} } return nil } // getGTLDLines returns the lines from the dat file within the gTLD data span, // or an error if the span isn't valid for the dat file. func (d datFile) getGTLDLines() ([]string, error) { if err := d.validate(); err != nil { return nil, err } return d.lines[d.gTLDSpan.startIndex:d.gTLDSpan.endIndex], nil } // ReplaceGTLDContent updates the dat file's lines to replace the gTLD data span // with new content. func (d *datFile) ReplaceGTLDContent(content string) error { if err := d.validate(); err != nil { return err } contentLines := strings.Split(content, "\n") beforeLines := d.lines[0:d.gTLDSpan.startIndex] afterLines := d.lines[d.gTLDSpan.endIndex:] newLines := append(beforeLines, append(contentLines, afterLines...)...) // Update the span based on the new content length d.gTLDSpan.endIndex = len(beforeLines) + len(contentLines) // and update the data file lines d.lines = newLines return nil } // String returns the dat file's lines joined together. func (d datFile) String() string { return strings.Join(d.lines, "\n") } // readDatFile reads the contents of the PSL dat file from the provided path // and returns a representation holding all of the lines and the span where the gTLD // data is found within the dat file. An error is returned if the file can't be read // or if the gTLD data span can't be found or is invalid. func readDatFile(datFilePath string) (*datFile, error) { pslDatBytes, err := os.ReadFile(datFilePath) if err != nil { return nil, err } return readDatFileContent(string(pslDatBytes)) } func readDatFileContent(pslData string) (*datFile, error) { pslDatLines := strings.Split(pslData, "\n") headerIndex, footerIndex := 0, 0 for i := 0; i < len(pslDatLines); i++ { line := pslDatLines[i] if line == PSL_GTLDS_SECTION_HEADER && headerIndex == 0 { // If the line matches the header and we haven't seen the header yet, capture // the index headerIndex = i } else if line == PSL_GTLDS_SECTION_HEADER && headerIndex != 0 { // If the line matches the header and we've already seen the header return // an error. This is unexpected. return nil, errMultipleHeaders } else if line == PSL_GTLDS_SECTION_FOOTER && footerIndex == 0 { // If the line matches the footer, capture the index. We don't need // to consider the case where we've already seen a footer because we break // below when we have both a header and footer index. footerIndex = i } // Break when we have found one header and one footer. if headerIndex != 0 && footerIndex != 0 { break } } if headerIndex == 0 { return nil, errNoHeader } else if footerIndex == 0 { return nil, errNoFooter } datFile := &datFile{ lines: pslDatLines, gTLDSpan: gTLDDatSpan{ startIndex: headerIndex + 1, endIndex: footerIndex, }, } if err := datFile.validate(); err != nil { return nil, err } return datFile, nil } // getData performs a HTTP GET request to the given URL and returns the // response body bytes or returns an error. An HTTP response code other than // http.StatusOK (200) is considered to be an error. func getData(url string) ([]byte, error) { resp, err := http.Get(url) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return nil, fmt.Errorf("unexpected status code fetching data "+ "from %q : expected status %d got %d", url, http.StatusOK, resp.StatusCode) } respBody, err := io.ReadAll(resp.Body) if err != nil { return nil, err } return respBody, nil } // filterGTLDs removes entries that are present in the legacyGTLDs map or have // ContractTerminated equal to true and an empty DelegationDate, // or a non-empty RemovalDate. func filterGTLDs(entries []*pslEntry) []*pslEntry { var filtered []*pslEntry for _, entry := range entries { if _, isLegacy := legacyGTLDs[entry.ALabel]; isLegacy { continue } // If the Delegation Date is not empty, the gTLD is likely in EBERO. if entry.ContractTerminated && entry.DelegationDate == "" { continue } if entry.RemovalDate != "" { continue } filtered = append(filtered, entry) } return filtered } // getPSLEntries fetches a list of pslEntry objects (or returns an error) by: // 1. getting the raw JSON data from the provided url string. // 2. unmarshaling the JSON data to create pslEntry objects. // 3. normalizing the pslEntry objects. // 4. filtering out any legacy or contract terminated gTLDs // // If there are no pslEntry objects after unmarshaling the data in step 2 or // filtering the gTLDs in step 4 it is considered an error condition. func getPSLEntries(url string) ([]*pslEntry, error) { respBody, err := getData(url) if err != nil { return nil, err } var results struct { GTLDs []*pslEntry } if err := json.Unmarshal(respBody, &results); err != nil { return nil, fmt.Errorf( "unmarshaling ICANN gTLD JSON data: %v", err) } // We expect there to always be GTLD data. If there was none after unmarshaling // then its likely the data format has changed or something else has gone wrong. if len(results.GTLDs) == 0 { return nil, errors.New("found no gTLD information after unmarshaling") } // Normalize each tldEntry. This will remove leading/trailing whitespace and // populate the ULabel with the ALabel if the entry has no ULabel. for _, tldEntry := range results.GTLDs { tldEntry.normalize() } filtered := filterGTLDs(results.GTLDs) if len(filtered) == 0 { return nil, errors.New( "found no gTLD information after removing legacy and contract terminated gTLDs") } return filtered, nil } // renderTemplate renders the given template to the provided writer, using the // templateData, or returns an error. func renderTemplate(writer io.Writer, template *template.Template, templateData interface{}) error { var buf bytes.Buffer if err := template.Execute(&buf, templateData); err != nil { return err } _, err := writer.Write(buf.Bytes()) if err != nil { return err } return nil } // clock is a small interface that lets us mock time in unit tests. type clock interface { Now() time.Time } // realClock is an implementation of clock that uses time.Now() natively. type realClock struct{} // Now returns the current time.Time using the system clock. func (c realClock) Now() time.Time { return time.Now() } // renderHeader renders the pslHeaderTemplate to the writer or returns an error. The // provided clock instance is used for the header last update timestamp. If no // clk instance is provided realClock is used. func renderHeader(writer io.Writer, clk clock) error { if clk == nil { clk = &realClock{} } templateData := struct { URL string Date time.Time DateFormat string }{ URL: ICANN_GTLD_JSON_URL, Date: clk.Now().UTC(), DateFormat: time.RFC3339, } return renderTemplate(writer, pslHeaderTemplate, templateData) } // renderData renders the given list of pslEntry objects using the pslTemplate. // The rendered template data is written to the provided writer or an error is // returned. func renderData(writer io.Writer, entries []*pslEntry) error { templateData := struct { Entries []*pslEntry }{ Entries: entries, } return renderTemplate(writer, pslTemplate, templateData) } // Process handles updating a datFile with new gTLD content. If there are no // gTLD updates the existing dat file's contents will be returned. If there are // updates, the new updates will be spliced into place and the updated file contents // returned. func process(datFile *datFile, dataURL string, clk clock) (string, error) { // Get the lines for the gTLD data span - this includes both the header with the // date and the actual gTLD entries. spanLines, err := datFile.getGTLDLines() if err != nil { return "", err } // Render a new header for the gTLD data. var newHeaderBuf strings.Builder if err := renderHeader(&newHeaderBuf, clk); err != nil { return "", err } // Figure out how many lines the header with the dynamic date is. newHeaderLines := strings.Split(newHeaderBuf.String(), "\n") headerLen := len(newHeaderLines) // We should have at least that many lines in the existing span data. if len(spanLines) <= headerLen { return "", errors.New("gtld span data was too small, missing header?") } // The gTLD data can be found by skipping the header lines existingData := strings.Join(spanLines[headerLen:], "\n") // Fetch new PSL entries. entries, err := getPSLEntries(dataURL) if err != nil { return "", err } // Render the new gTLD PSL section with the new entries. var newDataBuf strings.Builder if err := renderData(&newDataBuf, entries); err != nil { return "", err } // If the newly rendered data doesn't match the existing data then we want to // update the dat file content by replacing the old span with the new content. if newDataBuf.String() != existingData { newContent := newHeaderBuf.String() + "\n" + newDataBuf.String() if err := datFile.ReplaceGTLDContent(newContent); err != nil { return "", err } } return datFile.String(), nil } func main() { ifErrQuit := func(err error) { if err != nil { fmt.Fprintf(os.Stderr, "error updating gTLD data: %v\n", err) os.Exit(1) } } pslDatFile := flag.String( "psl-dat-file", "public_suffix_list.dat", "file path to the public_suffix_list.dat data file to be updated with new gTLDs") overwrite := flag.Bool( "overwrite", false, "overwrite -psl-dat-file with the new data instead of printing to stdout") // Parse CLI flags. flag.Parse() // Read the existing file content and find the span that contains the gTLD data. datFile, err := readDatFile(*pslDatFile) ifErrQuit(err) // Process the dat file. content, err := process(datFile, ICANN_GTLD_JSON_URL, nil) ifErrQuit(err) // If we're not overwriting the file, print the content to stdout. if !*overwrite { fmt.Println(content) os.Exit(0) } // Otherwise print nothing to stdout and write the content over the exiting // pslDatFile path we read earlier. err = os.WriteFile(*pslDatFile, []byte(content), 0644) ifErrQuit(err) }