index.html

<!DOCTYPE html>
<html lang="en" dir="ltr">
  <head>
	<meta charset="utf-8"/>
    <title>Internationalization Best Practices for Spec Developers</title>

    <!--
      Per the W3C Manual of Style, this document avoids gender-specific pronouns.
      In this document, the terms "he", "his", and "her" appear in an example of gender-specific
      cultural norms (related to personal name handling in applications), which represents a
      well-considered exception to this rule. The word-token "he"
      appears elsewhere in this document as a language tag for the Hebrew language.
    -->
    <script src="make_checklist.js"></script>
    <script async src="https://www.w3.org/Tools/respec/respec-w3c" class="remove"></script>
    <script class="remove">
      // ReSpec configuration object for this document.
      var respecConfig = {
          // specification status (e.g. WD, LCWD, NOTE, etc.). If in doubt use ED.
          specStatus:           "ED",
          //publishDate:        "2015-10-20",
          //previousPublishDate:  "2020-05-29",
          //previousMaturity:   "WD",


          noRecTrack:           true,
          shortName:            "international-specs",
          copyrightStart:       "2014",
          edDraftURI:           "https://w3c.github.io/bp-i18n-specdev/",

          // editors, add as many as you like
          // only "name" is required
          editors:  [
              { name: "Richard Ishida", company: "W3C", w3cid: 3439 },
              { name: "Addison Phillips", w3cid: 33573 }
          ],


          group:        "i18n",
          github:       "w3c/bp-i18n-specdev",
          maxTocLevel: 3,
          xref: ["i18n-glossary", "webidl"],

          postProcess: [
              // Replace every <link rel="stylesheet" data-import> with a <style>
              // element holding the fetched stylesheet text. All links are
              // processed concurrently; the function resolves once each one has
              // either been inlined or been left in place.
              async function importStyleSheet() {
                  const links = document.querySelectorAll(`link[rel='stylesheet'][data-import]`);
                  await Promise.all(
                      [...links].map(async link => {
                          try {
                              const response = await fetch(link.href);
                              // fetch() only rejects on network failure; an HTTP
                              // error still resolves, and its body (an error
                              // page) must not be inlined as CSS.
                              if (!response.ok) {
                                  throw new Error(`HTTP ${response.status}`);
                              }
                              const style = document.createElement("style");
                              style.textContent = await response.text();
                              link.replaceWith(style);
                          } catch (err) {
                              // Keep the original <link> so the stylesheet can
                              // still load the normal way, and do not let one
                              // failure reject the whole batch.
                              console.warn(`importStyleSheet: could not inline ${link.href}`, err);
                          }
                      })
                  );
              }
          ]
      };
      
    </script>
    <!-- I18N's shared stylesheet -->
	<link rel="stylesheet" data-import href="https://w3c.github.io/i18n-drafts/style/respec_2022.css">
    <!-- local styles for this document -->
	<link rel="stylesheet" href="local.css">
  
  <script>// check for changed fragment ids and route to the new id
  // Fragment identifiers that have been renamed: when the page is opened with
  // an old id in the URL, rewrite the hash so the browser lands on the new one.
  var fragid = location.hash
  if (fragid !== '') {
    const renamedFragments = new Map([
        ['#sec_lang_decl', '#lang_misc'],
        ['#sec_lang_values', '#lang_values'],
        ['#sec_lang_declaration', '#lang_declaration'],
        ['#sec_dir_basic', '#dir_misc'],
        ['#sec_bidi_values', '#bidi_values'],
        ['#sec_bidi_markup', '#bidi_markup'],
        ['#sec_bidi_strings', '#bidi_strings'],
        ['#char_ref_Unicode_char', '#char_ref'],
        ['#sec_resid_basic', '#resid_misc'],
        ['#text_decoration', '#typ_text_decoration'],
        ['#cursive', '#typ_cursive'],
        ['#box_posn', '#typ_box_posn'],
        ])
    const currentId = renamedFragments.get(fragid)
    // Unknown fragments are left untouched.
    if (currentId !== undefined) {
        location.hash = currentId
        }
    }
  </script>
  
  <script>
	  /**
       * Sort the rows of a table in place by the text of one column, using
       * en-US collation. The first row is treated as the header and is never
       * moved.
       *
       * @param {string} tableName - id of the table element; if no element has
       *   this id the function returns without doing anything.
       * @param {number} colNum - zero-based index into each row's TD elements.
       * @param {boolean} reversed - truthy for descending order, falsy for ascending.
       */
	  function sortTable(tableName, colNum, reversed) {
          const table = document.getElementById(tableName);
          if (!table) return;

          const collator = new Intl.Collator('en-US');

          // Everything after the header row, paired with its sort key. A row
          // that has no TD at colNum (e.g. a row of TH cells, or one shortened
          // by colspan) sorts as the empty string instead of throwing.
          const entries = Array.from(table.rows).slice(1).map(row => {
              const cell = row.getElementsByTagName("TD")[colNum];
              return { row: row, key: cell ? cell.textContent : '', home: row.parentNode };
          });
          if (entries.length < 2) return;

          // Array.prototype.sort is stable, so rows with equal keys keep their
          // relative order, as they did with adjacent-swap sorting.
          const sorted = entries.slice().sort((a, b) =>
              reversed ? collator.compare(b.key, a.key) : collator.compare(a.key, b.key)
          );

          // Put the i-th sorted row into the parent that held the i-th row
          // originally. Appending in order moves every data row behind any
          // header row that shares the same parent.
          sorted.forEach((entry, index) => {
              entries[index].home.appendChild(entry.row);
          });
      } // sortTable
  </script>
  </head>
  
  <body onload="sortTable('exampleNamesTable', 0, false);">
  <div id="abstract">
      <p>This document provides a checklist of internationalization-related considerations when developing a specification. Most checklist items point to detailed supporting information in other documents. Where such information does not yet exist, it can be given a temporary home in this document.  <strong>The  information in this document will change regularly as new content is added and existing content is modified in the light of experience and discussion.</strong></p>
  </div>

  <div id="sotd">
  <p>This document provides advice to specification developers about how to incorporate requirements for international use. What is currently available here is expected to be useful immediately, but is still an early draft and the document is in flux, and will grow over time as knowledge applied in reviews and discussions can be crystallized into guidelines.</p>
</div>

  <section id="intro">
    <h2>Introduction</h2>
    <p>Developers of specifications need advice  to ensure that what they produce will work for communities around the globe.</p>  
    <p>The Internationalization (i18n) WG tries to assist working groups  by reviewing specifications and engaging in discussion. Often, however, such interventions come later in the process than would be ideal, or mean that the i18n WG has to repeat the same information for each working group it interacts with.</p>
    <p>It would be better if specification developers could access a checklist of best practices, which points to explanations, examples and rationales where developers need them. Developers would then be able to build this knowledge into their work from the earliest stages, and could thereby reduce the rework needed when the i18n WG reviews their specification.</p>
    <p>This document contains the beginnings of a checklist, and points to locations where you can find explanations, examples and rationales for the recommendations made. If there is no such other place, that extra information will be added to this document. It may also be used to develop ideas and organize them.</p>
    <p>The guidelines in this document are not intended to be hard and fast requirements. This document will achieve a significant part of its purpose if, where you don't understand the guidelines or disagree with them, you contact the Internationalization WG to discuss what should be done.</p>
    <p class="note">In this document, the term <a>natural language</a> is usually used to refer to the portions of a document or protocol intended for human consumption. The term <a>localizable text</a> is used to refer to the natural language content of formal languages, protocol syntaxes and the like, as distinct from <a>syntactic content</a> or <a>user-supplied values</a>. See the [[I18N-GLOSSARY]] for definitions of these and other terms used by the Internationalization Working Group.</p>


<section class="appendix" id="ghChecklist">
<h3>Create a GitHub checklist</h3>

<p>A checklist feature is provided with this page to help you review your spec for internationalization. The results of the review should be posted to a GitHub issue.</p>

<p>Follow these steps for each section that is relevant to your spec:</p>

<ol>
<li>Open the checklist by clicking on &quot;Show the self-review checklist&quot;.</li>

<li>For each requirement that is relevant to your spec, click on the first checkbox.</li>

<li>For each requirement that your spec fulfills, click on the second checkbox. (Tip: To save time, clicking on the second checkbox will automatically turn on the first checkbox, too.)</li>

<li>When finished, click on the button &quot;Create markdown for GitHub&quot;. This will produce markdown for just the requirements that you indicated were relevant to your spec.</li>

<li>Copy the markdown code to a comment in a GitHub issue where you are capturing the results of your self-review work. If you have already done a review using the short review checklist you should copy the results produced here to other comment fields in that issue. This keeps all the review information together. Note that you'll need to repeat this copy-paste for each of the sections that contain requirements relevant to your spec.</li>

<li>Add clarification notes for the results by editing the markdown in the GitHub issue.</li>
<li>Ensure that your GitHub issue has the i18n-tracker label set, so that the Internationalization WG is aware of your review results.</li>
</ol>
</section>

<section id="I18N_Considerations">
<h3>When and how to write an <em>Internationalization Considerations</em> section in your spec</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Ai18n_considerations" target="_blank">See related review comments.</a></p>

<div class="req" id="i18n_considerations_review">
   <p class="advisement">All additions of or changes to an <cite>Internationalization Considerations</cite> section MUST be reviewed by the Internationalization (i18n) WG.</p>
</div>

<div class="req" id="i18n_considerations_title">
   <p class="advisement">If you create an internationalization considerations section, it MUST have the title <cite>Internationalization Considerations</cite> or <cite>Internationalization (i18n) Considerations</cite>.</p>
</div>

<p>Specifications are not required to include a special section or appendix describing internationalization considerations of their specification. In general, the Internationalization WG instead prefers that information about language, regional, or cultural variation, support, or adaptation appear in the body of the specification, closely associated with the relevant features.</p>

<p>However, there are a few cases in which you might consider providing a section like this. Consider including an internationalization considerations section when:</p>
<ul>
	<li>International features require additional explanation that would otherwise interfere with or clutter up the body of the specification.</li>
	<li>You wish to provide examples of features, such as localization, without interfering with the body of the specification. For example, summaries of the general approach taken, or factors that affect the approach taken throughout the document.</li>
	<li>There are specific limitations or problems that your specification is unable to address, such as (but not limited to) technology that is evolving but not yet ready for inclusion; limitations discovered during the horizontal review process that you intend to address in future versions; or deliberate design decisions that limit or impact certain languages, groups, or cultures.</li>
	<li>You have other information you wish to provide to adopters or implementers that doesn't fit with the remainder of your spec.</li>
</ul>

<p>If you decide to create an Internationalization Considerations section, it will usually be as an appendix. However, the order and placement relative to other parts of your spec or to other appendices is up to you.</p>

<p>If you decide to create an Internationalization Considerations section, you need to mention it in your horizontal review request to the Internationalization WG. The review request template includes a checkbox which allows you to do this easily.</p>

</section>
</section>


<section id="language" class="topic">
<h2>Language</h2>


<div id="language_checklist" class="summaryC"></div>


<section id="lang_misc" class="subtopic">
<h3>Language basics</h3>


<aside class="links" id="links_lang_decl">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-lang-why">Why use the language attribute?</a>.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-text-processing-vs-metadata">Types of language declaration</a>. How 'metadata' and 'text-processing' types of language information differ.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/articles/lang-bidi-use-cases/">Use cases for bidi and language metadata on the Web</a>.</p></li>
</ul>
</aside>


 	<div class="req" id="lang_basics_1">
	<p class="advisement">It should be possible to associate a language with any piece of <a>localizable text</a> or <a>natural language</a> content.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/International/questions/qa-lang-why">Why use the language attribute?</a></p>
	<p><a href="https://www.w3.org/International/articles/lang-bidi-use-cases/">Use cases for bidi and language metadata on the Web</a></p>
	</details>
	</div>


 	<div class="req" id="lang_basics_inline">
	<p class="advisement">Where possible, there should  be a way to label <a>natural language</a> changes in inline text.</p>
	</div>
  
  <p>Text is rendered or processed differently according to the language it is in. For example, screen readers need to be prompted when a language changes, and spell checkers should be language-sensitive. When rendering text, knowledge of the language is needed in order to apply correct fonts, hyphenation, line-breaking, upper/lower case changes, and other features.</p>
  <p>For example, ideographic characters such as 雪, 刃, 直, 令, 垔 have slight but important differences when used with  Japanese vs Chinese fonts, and it's important not to apply a Chinese font to the Japanese text, and vice versa when it is presented to a user.</p>
  
  
   	<div class="req" id="lang_basics_meta">
	<p class="advisement">Consider whether it is useful to express the [=intended linguistic audience=] of a resource, in addition to specifying the language used for <a href="#sec_text_processing_lang">text processing</a>.</p>
	<details class="links"><summary>more</summary>
		<p><a href="https://www.w3.org/International/questions/qa-text-processing-vs-metadata">Types of language declaration</a></p>
	</details>
	</div>

  <p>Language information for a given resource can be used with two main objectives in mind: for text-processing, or as a statement of the intended use of the resource. We will explain the difference below.</p>


  <section id="sec_text_processing_lang">
  <h4>Text-processing language information</h4>
  
   	<div class="req" id="tp_lang_values">
	<p class="advisement">A language declaration that indicates the [=text-processing language=] for a range of text must associate  a single language  value with a specific range of text.</p>
	</div>
  
  <p>When specifying the <a>text-processing language</a> you are declaring the language in which <strong>a specific range of text is actually written</strong>, so that user agents or applications that manipulate the text, such as voice browsers, spell checkers,  style processors, hyphenators, etc., can apply the appropriate rules to the text in question. So we are, by necessity, talking about associating a <em>single</em> language with a <em>specific</em> range of text.</p>
    <p>It is normal to express a text-processing language as the default language, for processing the resource as a whole, but it may also be necessary to indicate where the language changes within the resource.</p>

   	<div class="req" id="lang_attribute_xml">
	<p class="advisement">Use the HTML <code class="kw" translate="no">lang</code> and XML <code class="kw" translate="no">xml:lang</code> language attributes where appropriate to identify the <a href="#sec_text_processing_lang">text processing language</a>, rather than creating a new attribute or mechanism.</p>
	</div>
  
    <p>To identify the text-processing language for a range of text, HTML provides the <code class="kw" translate="no">lang</code> attribute, while XML provides <code class="kw" translate="no">xml:lang</code> which can be used in all XML formats. It's useful to continue using those attributes for relevant markup formats, since authors recognize them, as do HTML and XML processors.</p>
  </section>


<section id="sec_lang_meta">
<h4>Language metadata about the resource as a whole</h4>

<p>It may also be useful to describe the language of a resource <strong>as a whole</strong>. This type of language declaration is called the <dfn>intended linguistic audience</dfn> <strong>of a resource</strong>. For example, such metadata may be used for searching, serving the right language version, classification, etc. </p>
    <p>This type of language declaration differs from that of the text-processing declaration in that (a) the value for such declarations may be more than one language tag, and (b) the language value declared doesn't indicate which bits of a multilingual resource are in which language.</p>


   	<div class="req" id="metadata_lang_values">
	<p class="advisement">It should be possible to associate a  metadata-type language declaration (which indicates the intended use of the resource rather than the language of a specific range of text)  with multiple language  values.</p>
	</div>

   <p>The language(s) describing the intended use of a resource do not necessarily include every language used in a document. For example, many documents on the Web contain embedded fragments of content in different languages, whereas the page is clearly aimed at speakers of one particular language. For example, a German city-guide for Beijing may contain useful phrases in Chinese, but it is aimed at a German-speaking audience, not a Chinese one.</p>
    <p>On the other hand, it is also possible to imagine a situation where a document contains the same or parallel content in more than one language. For example, a web page may welcome Canadian readers with French content in the left column, and the same content in English in the right-hand column. Here the document is equally targeted at speakers of both languages, so there are two audience languages. Another use case is a blog or a news page aimed at a multilingual community, where some articles on a page are in one language and some in another. In this case, it may make sense to list more than one  language tag as the value of the language declaration.</p>


   	<div class="req" id="metadata_not_lang">
	<p class="advisement">Attributes that express the language of external resources should not use the HTML <code class="kw" translate="no">lang</code> and XML <code class="kw" translate="no">xml:lang</code> language attributes, but should use a different attribute when they represent metadata (which indicates the intended use of the resource rather than the language of a specific range of text).</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/International/questions/qa-when-xmllang"><code class="kw" translate="no">xml:lang</code> in XML document schemas</a> – When should I use xml:lang and when should I define my own element or attribute for passing language values in an XML document schema (DTD)?</p>
	</details>
	</div>

<p>Using a different attribute to indicate the language of an external resource allows the attribute to specify more than one language. It also works better if the resource pointed to is not in a single language. </p>
<p>This distinction can be seen in HTML in the separation of the <code class="kw" translate="no">lang</code> and <code class="kw" translate="no">hreflang</code> attributes. The former indicates the language of the text within the HTML page; the latter is metadata indicating the  expected language of a page that is linked to.</p>
<p>For a longer discussion of this see <a href="https://www.w3.org/International/questions/qa-when-xmllang">xml:lang in XML document schemas</a>. This article talks specifically about <code class="kw" translate="no">xml:lang</code>, but the concepts are applicable to other situations.</p>
</section>
</section>


<section id="lang_values" class="subtopic">
<h3>Defining language values</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Alang_values" target="_blank">See related review comments.</a></p>

<aside class="links" id="lang_values_links">
<p class="links_title">Useful background and overviews for this section</p>

	<ul>
          <li class="w3">
            <p class="link"><a href="https://www.w3.org/International/articles/language-tags/">Language tags in HTML and XML</a></p>
            <p class="desc">An overview of how to create language tags using BCP 47.</p>
          </li>
    <li class="nonw3">
            <p class="link"><a href="https://www.rfc-editor.org/rfc/bcp/bcp47.txt">BCP 47</a></p>
            <p class="desc">The IETF specification that defines how language tags are formed, subtags are registered, as well as how to match language tags.</p>
    </li>
	</ul>
</aside>


	<div class="req" id="lang_use_bcp47">
	<p class="advisement">Values for language declarations must use BCP 47.</p>
	<details class="links"><summary>more</summary>
    <p><a href="https://www.w3.org/International/articles/language-tags/">Language tags in HTML and XML</a></p>
	<p><a href="https://www.rfc-editor.org/rfc/bcp/bcp47.txt">BCP 47</a></p>
	</details>
	</div>

    <p>BCP 47 is the <a>language tag</a> system used by Internet and Web standards (and many other places). It defines a method of using <em>subtags</em> from an IANA registry to form a string which describes the language of content. The subtags in the registry are primarily based on (and maintain strict compatibility with) ISO and UN standards for identifying languages, scripts, regions, and countries. BCP 47 also forms the basis for <a>Unicode locales</a>.</p>
    <p>For an overview of the key features of BCP 47, see <a href="https://www.w3.org/International/articles/language-tags/">Language tags in HTML and XML</a>.</p>
 

	<div class="req" id="lang_bcp_not_rfc">
	<p class="advisement">Refer to BCP 47, not to its constituent parts, such as RFC 5646 or RFC 4647.</p>
	</div>

    <p>The link to and name of BCP 47 were created specifically so that there is an unchanging reference to the definition of <cite>Tags for the Identification of Languages</cite>. RFCs 1766, 3066, 4646 were previous (superseded) versions. The current version of BCP 47 is made up of two RFCs: 5646 and 4647.</p>

	<div class="req" id="lang_values_specificity">
	<p class="advisement">Be specific about what level of conformance you expect for language tags: BCP 47 defines two levels of conformance, "valid" and "well-formed".</p>
	</div>

    <p>A <strong>well-formed</strong> BCP 47 language tag follows the syntax defined for a language tag: implementations check that each language tag consists of hyphen-separated subtags; each subtag has a specific length and specific content (letters, digits or specific combinations) depending on the placement in the tag. A <strong>valid</strong> BCP 47 language tag is well-formed but additionally ensures that only subtags that are listed in the IANA Subtag Registry are used. Note that the IANA Subtag Registry is frequently updated with new subtags.</p>


	<div class="req" id="lang_checking_well_formed">
	<p class="advisement">Specifications may require implementations to check if language tags are "valid", but in most circumstances should only require that the language tags be "well-formed".</p>
	</div>

    <p>Most specifications are second-order consumers of language metadata &ndash; they are using data already provided in the document format (HTML <span class=kw translate=no>lang</span>, XML <span class=kw translate=no>xml:lang</span>, or the document format's language fields/attributes).</p>

    <p>Generally most specifications are concerned with selecting resources (such as spell checkers, tokenizers, fonts, etc.) or with matching (selecting which string to show, for example) and don't directly care about the content of the language tag. Invalid-but-well-formed tags just don't match anything and usually fallback schemes provide some behavior that is appropriate.</p>

    <p>There might be cases where a specification really wants implementation-level checking for validity. In those cases, the result of a tag failing to be valid has to be specified (should the application die, warn the user, etc.). It's also a problem that the registry is sizeable and changes over time, so each implementation is registry-version dependent. The changes over time are often minor, but real users will encounter interoperability issues if random (out of date) implementations of the specification reject language tags that have become valid at a later date.</p>

    <p>In addition, BCP 47 has an extension mechanism which defines add-on subtag sequences. For example, one extension [[RFC6067]] (Unicode Locales, which uses the singleton <span class=kw translate=no>-u</span>), is commonly used for controlling the internationalization features of JavaScript (and has other uses). Validating these additional subtags is likely out of scope for most specifications.</p>


	<div class="req" id="lang_values_valid">
	<p class="advisement">Specifications should require content and content authors to use "valid" language tags.</p>
	</div>

    <p>Normative language regarding language tags might be different between content and implementation requirements. Specification authors need to carefully consider what conformance requirements and tests are needed for their specification and what implementations are required to do. One solution is to normatively require that "valid" language tags be used by content authors but only require implementations to check for "well-formed" language tags.</p>
 	
 	<div class="req" id="use_lstr">
		<p class="advisement">Specifications SHOULD refer to the IANA Language Subtag Registry instead of providing lists of codes extracted from ISO 639, ISO 3166, or other standards.</p>
		<details class="links"><summary>more</summary>
		  <p><a href="https://www.w3.org/International/questions/qa-choosing-language-tags">Choosing a language tag</a></p>
		  <p><a href="https://r12a.github.io/app-subtags/">Language subtag lookup tool</a></p>
		  <p><a href="https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry">IANA Language Subtag Registry</a></p>
		</details>
 	</div>
 	
 	<p>In the past, some of the standards used to provide subtags found in language tags were not freely or publicly available, so some specifications provided lists in order to help ensure interoperability. This is no longer necessary. As part of BCP 47, IANA maintains the language subtag registry, which is a publicly available, machine-readable list of valid subtags for use in constructing language tags. This registry is based on underlying standards, including the various parts of ISO 639 (639-1, 639-2, 639-3, etc.), ISO 15924 script codes, and ISO 3166 and UN M.49 region codes. The registry is actively maintained, stabilized, and comprehensive in ways that other lists found on the Internet might not be. Each of the subtag types is kept in sync with parent standards with the help and participation of those standards maintainers, so extracting or making your own list of codes or referring to ones found elsewhere can lead to maintenance problems or confusion.</p>
 	
 	<div class="req" id="avoid_lang_lists">
		<p class="advisement">Avoid creating a list of valid or supported language tags, language subtags, or [=locales=].</p>
 	</div>
 	
 	<p>Making your own list of fully formed language tags will unnecessarily restrict the list of languages that can be used. In addition, locale data is always being expanded, so a list that describes support today will become outdated in the future. Restricting which tags or subtags are available to users conflicts with our goal of providing universal access.</p>
</section>


<section id="lang_declaration" class="subtopic">
<h3>Declaring  language</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Alang_declaration" target="_blank">See related review comments.</a></p>

<section id="sec_lang_mixed">
<h4>Declaring  language at the resource level</h4>

   <p>Here we are talking about an independent unit of data that contains structured text. Examples may include a whole HTML page, an XML document, a JSON file, a WebVTT script, an annotation, etc.</p>


<aside class="links" id="links_lang_mixed_links">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-lang-why">Why use the language attribute?</a>.</p></li>
<!--li class="w3"><p class="link"><a href="https://w3c.github.io/i18n-discuss/notes/annotation-language-use-cases">Use cases for language information in web annotations</a>. Illustrates the difference between text-processing and metadata types of language declaration.</p></li-->
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-text-processing-vs-metadata">Types of language declaration</a>. How 'metadata' and 'text-processing' types of language information differ.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/articles/lang-bidi-use-cases/">Use cases for bidi and language metadata on the Web</a>.</p></li>
</ul>
</aside>


<div class="xref"><span class="seealso">See also</span>
<p>[[[#lang_values]]].</p>
</div>


	<div class="req" id="lang_whole_res">
	<p class="advisement">The specification should indicate how to define the default text-processing language for the resource as a whole.</p>
	</div>

<p>It often saves trouble to identify the language, or at least the default language, of the resource as a whole in one place. For example, in an HTML file, this is done by setting the <code class="kw" translate="no">lang</code> attribute on the <code class="kw" translate="no">html</code> element.</p>


	<div class="req" id="lang_inherit">
	<p class="advisement">Content within the resource should inherit the text-processing language declared at the resource level, unless it is specifically overridden.</p>
	</div>

	<div class="req" id="lang_tp_meta">
	<p class="advisement">Consider whether it is necessary to have separate declarations to indicate the text-processing language versus metadata about the expected use of the resource.</p>
	</div>

<p>In many cases a resource contains  text in only one language, and in many more cases the language declared as the default language for text-processing is the same as the language that describes the metadata about the resource as a whole. In such cases it makes sense to have a single declaration.</p>
    <p>It becomes problematic, however, to use a single declaration when it refers to more than one language unless there is a way to determine which one language should be used as the text-processing default.</p>


<div class="req" id="lang_mixing">
	<p class="advisement">If there is only one language declaration for a resource, and it has more than one language tag as a value, it must be possible to identify the default text-processing language for the resource.</p>
	</div>
</section>


<section id="lang_block">
<h4>Establishing the language of a content block</h4>

<div class="xref"><span class="seealso">See also</span>
<p>[[[#lang_values]]].</p>
</div>

    <p>The words <dfn class="lint-ignore">block</dfn> and/or <dfn class="lint-ignore">chunk</dfn> are used here to refer to a structural component within the resource as a whole that groups content together and separates it from adjacent content. Boundaries between one block and another are equivalent to paragraph or section boundaries in text, or discrete data items inside a file. </p>
    <p>For example, this could refer to a block or paragraph in XML or HTML, an object declaration in JSON, a cue in WebVTT, a line in a CSV file, etc. Contrast this with <dfn class="lint-ignore">inline</dfn> content, which describes a range within a paragraph, sentence, etc.</p>
    <p>The interpretation of which structures defined in a spec are relevant to these requirements may require a little consideration, and will depend on the format of the data involved.</p>
 
 
	<div class="req" id="lang_block_inherit">
	<p class="advisement">By default, blocks of content should inherit any text-processing language set for the resource as a whole.</p>
	</div>

  <p>See [[[#lang_misc]]] for guidance related to the default text-processing language information.</p>


	<div class="req" id="lang_block_change">
	<p class="advisement">It should be possible to indicate a change in language for blocks of content where the language changes.</p>
	</div>
</section>


<section id="lang_inline">
<h4>Establishing the language of inline runs</h4>
 
<p>In this section we refer to information that needs to be provided for a range of characters in the middle of a paragraph or string.</p>

<div class="xref"><span class="seealso">See also</span>
<p>[[[#lang_values]]].</p>
</div>


	<div class="req" id="lang_inline_spans">
	<p class="advisement">It should be possible to indicate language for spans of inline text where the language changes.</p>
	</div>

  <p>Where a switch in language can affect operations on the content, such as spell-checking, rendering, styling, voice production, translation, information retrieval, and so forth, it is necessary to indicate the range of text affected and identify the language of that content.</p>
</section>
</section>


<section id="lang_strings" class="subtopic">
<h3>Identifying the language of strings</h3>

<p class="note">The information in this section is being developed in <a href="https://www.w3.org/TR/string-meta/">Requirements for Language and Direction Metadata in Data Formats</a> [[STRING-META]]. That document is still being written, so these guidelines are likely to change at any time.</p>

<p>The exchange of data on the Web, to the degree possible, should use <a>locale-neutral</a> standardized formats. However, some data on the Web necessarily consists of <a>natural language</a> information intended for display to humans. This <a>natural language</a> information depends on and benefits from the presence of language and direction metadata for proper display. Along with support for Unicode, mechanisms for including and specifying the base direction and the natural language of spans of text are one of the key internationalization considerations when developing new formats and technologies for the Web.</p>

<p>The most basic best practice, which the Internationalization Working Group looks for in every specification, is:</p>

<div class="req" id="bp-determine">
	<p class="advisement">For any string field containing natural language text, it MUST be possible to determine the language and <a>string direction</a> of that specific string. Such determination SHOULD use metadata at the string or document level and SHOULD NOT depend on heuristics.</p>
</div>


<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Alang_strings_x" target="_blank">See related review comments.</a></p>
<aside class="links" id="lang_strings_links">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/">Strings on the Web: Language and Direction Metadata</a>.</p>
	<ul>
	<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#bp_and-reco">Best Practices, Recommendations, and Gaps</a>.</p></li>
	<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#use_cases">Requirements and Use Cases</a>.</p></li>
	<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#bidi-approaches">Approaches Considered for Identifying the Base Direction</a>.</p></li>
	</ul></li>
</ul>
</aside>

<div class="xref"><span class="seealso">See also</span>
<p>[[[#bidi_strings]]].</p>
</div>

<div class="note">
	<p>Work on language and direction metadata for string formats is a work in progress. Specifications might need to include a note indicating the need for future adoption of metadata. Here is a prototype:</p>
	<p class="example_note" style="background-color:white;border:1px solid green;padding:10px">The field <code>{fieldname}</code> should follow the best practices found in <cite>Strings on the Web: Language and Direction Metadata</cite> [[STRING-META]]. This includes making use of any future standards which emerge regarding the reporting of string language and direction metadata.</p>
</div>

<div class="req" id="bp_lang_field_based_metadata">
<p class="advisement">Use field-based metadata or string datatypes to indicate the language and the [=string direction=] for individual <a>localizable text</a> values.</p>
</div>

<p>Individual data values can differ in language or direction from other values found in the same data file or document. Providing metadata values directly associated with each <a>localizable text</a> field allows for the metadata to be overridden appropriately and helps applications automate processing when assembling, extracting, forwarding, or otherwise processing each data field for use.</p>

<div class="req" id="bp_default_setting">
<p class="advisement">Specifications MAY define a mechanism to provide the default language and the default [=string direction=] for all strings in a given resource. However, specifications MUST NOT assume that a resource-wide default is sufficient.  Even if a resource-wide setting is available, it must be possible to use string-specific metadata to override that default.</p>
</div>

<p>Many documents contain data in a single language. Providing a means of indicating the intended language audience, perhaps in a header, can reduce overall document size and complexity. However, the ability to override the default for specific string values remains important, as it is always possible that some strings might not be available in the document language, or that their base direction might not be consistent with the default direction of other <a>localizable text</a> values in the document as a whole.</p>

<div class="req" id="bp_default_fallback">
<p class="advisement">Specify that, in the absence of other information, the default direction and default language are unknown.</p>
</div>

<div class="req" id="bp_separate_localizable">
<p class="advisement">Specifications SHOULD be careful to distinguish <a>syntactic content</a>, including <a>user-supplied values</a>, from <a>localizable text</a>.</p>
</div>

<div class="req" id="bp_non_displayable_syntactic">
<p class="advisement">Specifications MUST NOT treat <a>syntactic content</a> values as "displayable".</p>
</div>

<div class="req" id="bp_do_not_use_language_non_data">
<p class="advisement">Specifications SHOULD NOT specify or require the use of language metadata for fields that cannot contain natural language text.</p>
</div>

<p>Document formats on the Web consist of text. In most cases, data values in a given document format are meant to be representative and meaningful, not just arbitrary strings. The fact that a data value consists of, for example, an English keyword does not make the data value a <a>natural language</a> string meant for display as text (that is, the value is not <a>localizable text</a>). Such data values are part of the <a>syntactic content</a> of the document: not only do they not require language and direction metadata, but they should not be associated with such metadata.</p>

<div class="req" id="bp_legacy_fmt_nonlang">
<p class="advisement">For string values and string fields that are <em>not</em> <a>localizable text</a>, specifications SHOULD specify that the field is non-linguistic in nature and recommend the language tag <code class="kw" translate="no">zxx</code> ("No linguistic content") be associated with each string value.</p>
</div>

<div class="req" id="bp_legacy_fmt_lang_unknown">
<p class="advisement">For string values and string fields that are known to contain <a>localizable text</a> but for which there is no possibility of language metadata from the underlying format, specifications SHOULD specify that the language of the content is unknown and recommend the language tag <code class="kw" translate="no">und</code> ("Undetermined") be associated with each string. Specifications MAY also allow the use of heuristics or the inference of the language from other field values where appropriate.</p>
</div>

<p>Some string values depend on or are defined by existing protocols or formats. Often these strings are not associated with or do not provide language or direction metadata. For example, many HTTP headers define their contents as if they were not <a>localizable text</a>, even when, in some cases, they contain natural language text. Consuming specifications sometimes need to take a dependency on strings of this nature or define a format that describes one of these strings. In these cases there will be no language or direction metadata for <a>consumers</a> to associate with the string in the specification's data structure or document format, and any metadata that the specification's data structure or document format provides (when functioning as a <a>producer</a>) will not be serialized through the underlying format.</p>

<div class="req" id="bp_unicode_tag_chars_nonuse">
<p class="advisement">Specifications SHOULD NOT use the Unicode "language tag" characters (code points <code>U+E0000</code> to <code>U+E007F</code>) for language identification.</p>
</div>

<p>The Unicode "language tag" characters are deprecated for use as language tags and there are many reasons why they are a poor solution to the language metadata problem in document formats and wire protocols. Specification authors are cautioned not to repurpose these characters or try to build new mechanisms for transmitting language information based on them.</p>

<div class="req" id="bp_language_indexing">
<p class="advisement">Specifications SHOULD recommend the use of <a>language indexing</a> when localizable strings can be supplied in multiple languages for the same value.</p>
</div>

<p><a>Producers</a> sometimes need to supply localized values for a given content item or data record. Sometimes this is done by <a>language negotiation</a> between the <a>producer</a> and <a>consumer</a>. Localization then takes place in the <a>producer</a> using the negotiated language to select the content returned.</p>
	
<p>Other times localization of a content item is done by having the <a>producer</a> return multiple language representations for the item and letting the <a>consumer</a> choose the value to display. This latter process is called <dfn>language indexing</dfn>. For more information about language indexing, see <a href="https://www.w3.org/TR/string-meta#localization-considerations"><cite>Localization Considerations</cite></a> in [[STRING-META]].</p>

<section id="lang_strings_jsonld">
<h4>Language information in JSON-LD</h4>

<aside class="links" id="lang_strings_links_jsonld">
<p class="links_title">Additional material on this sub-section's contents can be found in:</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/">Strings on the Web: Language and Direction Metadata</a>.</p>
	<ul>
	<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#technology_specific_solutions">Technology-specific solutions</a>.</p></li>
	</ul></li>
</ul>
</aside>

<p>[[JSON-LD]] provides several mechanisms for satisfying some of the best practices found in this section:</p>
	
<div class="req" id="bp_use_jsonld_language_context">
<p class="advisement">For documents that use [[JSON-LD]], use of [[JSON-LD]] <code class="kw" translate="no">@context</code> and the built-in <code class="kw" translate="no">@language</code> attribute is RECOMMENDED as a document level default.</p>
</div>

<div class="req" id="bp_use_jsonld_i18n_namespace">
<p class="advisement">Specifications SHOULD use the <code class="kw" translate="no">i18n</code> Namespace feature for RDF literals, as defined in [[JSON-LD]] 1.1.</p>
</div>

<div class="req" id="bp_use_jsonld_atsign">
<p class="advisement">Where the <code class="kw" translate="no">i18n</code> Namespace is not available or is inappropriate to use, specifications SHOULD require [[JSON-LD]] plain string literals for natural language values to provide string-specific language information.</p>
</div>

</section>

<!-- The following is in String-Meta but probably not appropriate for us to include here yet.
<div class="req" id="bp_localizable">
<p class="advisement">For [[WebIDL]]-defined data structures, define each <a>localizable text</a> (natural language text) field as a <q><a>Localizable</a></q>.</p>
</div>
-->


</section>


<section id="lang_detection" class="subtopic">
<h3>Detecting &amp; matching language</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Alang_detection" target="_blank">See related review comments.</a></p>

 	
<aside class="issue"><p>This section is under development.</p></aside>

<div class="req" id="lang_matching_bcp">
	<p class="advisement">Reference BCP 47 for language tag matching.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.rfc-editor.org/rfc/bcp/bcp47.txt">BCP 47</a></p>
	</details>
</div>

<p>In addition to defining language tags (in RFC 5646), BCP 47 also contains an RFC (RFC 4647) on the topic of matching language tags to a [=language range=]. Just as it is most appropriate to refer to the stable identifier BCP 47 for the definition of language tags, it is best to refer to BCP 47 when referencing matching schemes found therein.</p>

<p>Unicode's [[CLDR]] project defines additional algorithms, rules and processes for matching language tags when used as [=locale=] identifiers.</p>

</section>
</section>


<section id="text_direction" class="topic">
<h2>Text direction</h2>


<div id="text_direction_checklist" class="summaryC"></div>


<!--p>In this section:</p>
	<ul class="summary">
	<li>[[[#sec_dir_basic]]]</li>
	<li>[[[#sec_dir_background]]]</li>
	<li>[[[#sec_bidi_markup]]]</li>
	<li>[[[#sec_bidi_strings]]]</li>
	<li>[[[#bidi_inline]]]</li>
	</ul-->


  <p>It is important to establish direction for text written or mixed with right-to-left scripts. Characters in these scripts are stored in memory in the order they are typed and pronounced – called the logical order. The Unicode Bidirectional Algorithm (UBA) provides a lot of support for automatically rendering a sequence of characters stored in  logical order so that they are visually ordered as expected. Unfortunately, the UBA alone is not sufficient to correctly render bidirectional text, and additional information has to be provided about the default directional context to apply for a given sequence of characters.</p>


<!--details class="checklist" style="cursor: pointer;">
<summary onClick="showChecklist(this.parentNode.parentNode, 'dir_checklist')">Show recommendations as a checklist</summary>
<div id="dir_checklist"></div>
</details-->


 <section id="dir_misc" class="subtopic">
  <h3>Basic requirements</h3>
 <p>The basic requirements are as follows.</p>


	<div class="req" id="dir_paragraphs">
	<p class="advisement">It must be possible to indicate base direction for each individual paragraph-level item of <a>natural language</a> text that will be read by someone.</p>
	</div>
	
	<p>A special case of the above applies to [=natural language=] string values in data structures and document formats:</p>
	
	<div class="req" id="dir_strings">
	<p class="advisement">For any string field containing [=natural language=] text, it MUST be possible to determine the language and [=string direction=] of that specific string. Such determination SHOULD use metadata at the string or document level and SHOULD NOT depend on heuristics.</p>
	</div>

	<div class="req" id="dir_inline">
	<p class="advisement">It must be possible to indicate base direction changes for embedded runs of inline bidirectional text for all <a>localizable text</a>.</p>
	</div>

	<div class="req" id="dir_reasonable">
	<p class="advisement">Annotating right-to-left text must require the minimum amount of effort for people who work natively with right-to-left scripts.</p>
	</div>

    <p>Requiring a speaker of Arabic, Divehi, Hebrew, Persian, Urdu, etc. to add markup or control characters to every paragraph or small data item they write is far too much to be manageable. Typically, the format should establish a default direction and require the user to intervene only when exceptions have to be dealt with.</p>
</section>


<section id="sec_dir_background" class="subtopic">
<h3>Background information</h3>

<p>In this section we try to set out some key concepts associated with text direction, so that it will be easier to understand the recommendations that follow.</p>


<aside class="links" id="links_text_direction">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/articles/inline-bidi-markup/uba-basics">Unicode Bidirectional Algorithm basics</a>.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta">Strings on the Web: Language and Direction Metadata</a> [[STRING-META]].</p></li>
</ul>
</aside>


<section id="sec_dir_defs">
  <h4>Important definitions</h4>
  <p>In order to correctly display text written in a 'right-to-left' script or left-to-right text containing bidirectional elements, it is important to establish the <a href="https://www.w3.org/International/articles/inline-bidi-markup/uba-basics#context" class="termref">base direction</a> that will be used to dictate the order in which elements of the text will be displayed.</p>
  <p>If you are not familiar with what the Unicode Bidirectional Algorithm (UBA) does and doesn't do, and why the base direction is so important, read <a href="https://www.w3.org/International/articles/inline-bidi-markup/uba-basics">Unicode Bidirectional Algorithm basics</a>.</p>
  <aside class="example">
    <p>For example, the following annotation will not display correctly unless the application doing the display knows that the base direction needs to be right-to-left.</p>
    <pre>{
  "@context": "http://www.w3.org/ns/anno.jsonld",
  "id": "http://example.org/anno5",
  "type":"Annotation",
  "body": {
    "type" : "TextualBody",
    "text" : "פעילות הבינאום, W3C",
    "format" : "text/html",
    "language" : "he"
  },
  "target": "http://example.org/photo1"
}
    </pre>
    <p>You would expect the phrase in the <code class="kw" translate="no">text</code> property value to be displayed as</p>
    <p><span dir="rtl">פעילות הבינאום, W3C</span></p>
    <p>however, if there is no indication that the base direction should be right-to-left the following incorrect display will be produced:</p>
    <p>פעילות הבינאום, W3C</p>
  </aside>

  <p>In this section, the word <dfn class="lint-ignore">paragraph</dfn> indicates a run of text followed by a hard line-break in plain text, but may signify different things in other situations. In CSV it equates to 'cell', so a single line of comma-separated items is actually a set of comma-separated paragraphs.&nbsp; In HTML it equates to the lowest level of block element, which is often a <code class="kw" translate="no">p</code> element, but may be things such as <code class="kw" translate="no">div</code>, <code class="kw" translate="no">li</code>, etc., if they only contain text and/or inline elements. In JSON, it often equates to a quoted string value, but if a string value uses markup then paragraphs are associated with block elements, and if the string value is multiple lines of plain text then each line is a paragraph.</p>
  
  <aside class="note">
	  <p>While the Unicode Bidirectional Algorithm [[UAX9]] formally refers to <em>paragraphs</em> and <em>paragraph direction</em> (or the <em>base direction</em> of a paragraph), this can sometimes be confusing when the text in question is not in a long-form document. Instead, this document and others will sometimes use the terms "block direction" or especially [=string direction=] to refer to the paragraph direction of a specific string of natural language text.</p>
  </aside>
  
  <p>The term <a>metadata</a> is used here to mean information which could be an annotation or property associated with the data, or could be markup in scenarios that allow that, or could be a higher-level protocol, etc.</p>
</section>


<section id="setting_bd">
<h4>Ways base direction can be set for paragraphs</h4>
<p>There are a number of possible ways of setting the base direction.</p>
<ol>
  <li>The base direction of a paragraph may be set by an application or a user applying metadata to the paragraph.  Typical values for base direction may include <code class="kw" translate="no">ltr</code>, <code class="kw" translate="no">rtl</code> or <code class="kw" translate="no">auto</code>.
<ul>
  <li>The metadata may specifically indicate that  heuristics should be used. Then you would expect to consider the actual characters used in order to determine the base direction. (This is what happens if you set <code>dir=auto</code> on an HTML element.)</li>
  <li>The application may expect metadata, but there may be no such information provided.  In this case you would usually expect there to be a default direction specified, and the base direction for a cell would be set to that default.  The default is usually LTR. (This is what happens if you have no <code class="kw" translate="no">dir</code> attributes in your HTML file.)</li>
  <li>Where a format contains many paragraphs or chunks of information, and the language of text in all those chunks is the same, it is sometimes useful to allow a default base direction to be set for and inherited by all. This is what happens when you set the <code class="kw" translate="no">dir</code> attribute on the <code class="kw" translate="no">html</code> tag in HTML. Another example would be a subtitling file containing many cues, all written in Arabic; it would be best to allow the author to say at the start of the file that the default is RTL for all cue text. There should always be a way to override the direction information for a specific paragraph where needed.</li>
  </ul></li>
  <li>If the application expects no metadata to be available it should use heuristics to determine the base direction for each paragraph/cell. A typical solution, and one described by UAX #9 <cite>Unicode Bidirectional Algorithm</cite>, is to look for the first-strong character in the paragraph/cell.  (This is likely to apply if you are looking at plain text  that is not expected to be associated with metadata. It only happens with HTML if the direction is set to <code class="kw" translate="no">auto</code>, since HTML specifies a default direction.)
    <ul style="margin-left:0; margin-right:1em;">
  <li>Not all paragraphs using the first-strong method will have the correct base direction applied. In some cases, an Arabic or Hebrew, etc, paragraph may start with strong LTR characters. There must be a way to deal with this.</li>
  <li>Where a syntactic unit contains multiple lines of plain text (for example, a multiline cue text in a subtitling file), the first-strong heuristic needs to be applied to each line separately.</li>
  <li>There may be special rules that involve ignoring some sequence of characters or type of markup at the start of the paragraph before identifying the first strong character.</li>
  <li>In some cases there are no strong characters in a paragraph, and the base direction can be critically important for the data to be understood correctly, e.g. telephone numbers or MAC addresses. There needs to be a way to resort to an appropriate default for these cases.</li>
  </ul></li>
  <li>Whether or not  any metadata is specified, if a paragraph contains a string that starts with one of the Unicode bidi control characters  RLI, LRI, FSI, LRE, RLE, LRO, or RLO and ends with PDF/PDI, these characters will determine the base direction for the contained string. These characters, when placed in the content, explicitly override any previously set direction by creating an inline range and assigning a base direction to it.
    <ul style="margin-left:0; margin-right:1em;"><li>The effect of such characters does not extend past paragraph boundaries, but the range ought to be explicitly ended using the PDF/PDI control character, especially if a paragraph end is not easily detectable by the application.</li>
      <li>Because isolation is needed for bidirectional text to work properly, the Unicode Standard says that the isolating control codes RLI, LRI and FSI should be used rather than LRE or RLE. Unfortunately, those characters are still not widely supported.</li>
      <li>For structural components in markup, above the paragraph level, it is not possible to use the Unicode bidi control characters to define direction for  paragraphs, since these are inline controls only, and the effect is terminated by a paragraph end.</li>
      </ul>
  </li>
  </ol>
<p>When capturing text input by a user it is usually necessary to understand the context in which the user was inputting the data to determine the base direction of the input. In HTML, for example, this may be set by the direction inherited from the <code class="kw" translate="no">html</code> tag, or by the user pressing keys to set the base direction for a form field. It is then necessary to find some way of storing the information about base direction or associating it with the string when rendered. Typically, in this situation, any direction changes internal to the string being input are handled by the user and will be captured as part of the string.</p>
</section>


<section id="inline_changes">
<h4>Inline changes to base direction</h4>
<p>Embedded ranges of text <em>within</em> a single paragraph may need to have a different base direction. For example, </p>
<p>&quot;The title was '!NOITASILANOITANRETNI'.&quot;</p>
<p>where the span within the single quotes is in Hebrew/Arabic/Divehi, etc., and needs to have a [=RTL=] base direction, instead of the [=LTR=] base direction of the surrounding paragraph, in order to place the exclamation mark correctly.</p>

<p>If markup is available to the content author, it is likely to be easier and safer to use markup to indicate such inline ranges (see below). In HTML you would usually use an inline element with a <code class="kw" translate="no">dir</code> attribute to establish the base direction for such runs of text.  If you can't mark up the text, such as in HTML's <code class="kw" translate="no">title</code> element, or any environment that handles only plain text content, you have to resort to Unicode's paired control characters to establish the base direction for such an internal range.</p>

<p>Furthermore, inline ranges where the base direction is changed should be [=bidi isolated=] from surrounding text, so that the [=Unicode Bidirectional Algorithm=] doesn't produce incorrect results ("[=spillover=]") due to interference across boundaries.</p>

<aside class="example" title="Example of bidirectional interference across boundaries">
<p>Suppose an implementation assembled a string by concatenating various values together. For example, the description of a monitor attached to a system. This label might contain the brand name and model number (<kbd>Brand A123B</kbd>); resolution (<kbd>(1920 x 1080)</kbd>); size and type (<kbd>36" monitor</kbd>); as well as various features like a refresh rate of <kbd>75 Hz</kbd> or response time of <kbd>4 ms</kbd>. The resulting string in English might look like this (color has been added to make the effects more visible):</p>

<p class="spilloverExample" dir="ltr"><code>
<span style="color:red">Brand A123B</span> <span style="color:blue">(1920 x 1080)</span> <span style="color:green">36"</span> <span style="color:purple">monitor</span>, <span style="color:orange">75 Hz</span>, <span style="color:brown">4ms</span>
</code></p>

<p>If the same implementation assembling this string were on a system running in a locale that uses a right-to-left language (such as the Arabic examples shown below), the results of the same concatenation might look something like this:</p>

<p class="spilloverExample" dir="rtl"><code>
<span style="color:red">&#x0645;&#x0627;&#x0631;&#x0643;&#x0629; A123B</span> <span style="color:blue">(1920 x 1080)</span> <span style="color:green">36"</span> <span style="color:purple">&#x0634;&#x0627;&#x0634;&#x0629; &#x0627;&#x0644;&#x0643;&#x0645;&#x0628;&#x064A;&#x0648;&#x062A;&#x0631;</span>&#x60c; <span style="color:orange">75 Hz</span>&#x60c; <span style="color:brown">4 &#x0645;&#x0644;&#x0644;&#x064A; &#x062B;&#x0627;&#x0646;&#x064A;&#x0629;</span></code></p>

<p>The logical sequence of sub-strings remains the same, but the visual presentation is no longer intelligible. Notice how different parts of the description have become broken up and mixed together. The addition of isolating bidirectional controls (either markup or, when not available, Unicode control characters) to the above string produces better results:</p>

<p class="spilloverExample" dir="rtl"><code>
	<span style="color:red" dir="rtl">&#x0645;&#x0627;&#x0631;&#x0643;&#x0629; A123B</span> <span style="color:blue" dir="ltr">(1920 x 1080)</span> <span style="color:green" dir="ltr">36"</span> <span style="color:purple" dir="rtl">&#x0634;&#x0627;&#x0634;&#x0629; &#x0627;&#x0644;&#x0643;&#x0645;&#x0628;&#x064A;&#x0648;&#x062A;&#x0631;</span>&#x60c; <span style="color:orange" dir="rtl">75 Hz</span>&#x60c; <span style="color:brown" dir="rtl">4 &#x0645;&#x0644;&#x0644;&#x064A; &#x062B;&#x0627;&#x0646;&#x064A;&#x0629;</span>
</code></p>
</aside>

<p>This means that if a content author is using Unicode control codes they should use the isolating controls <code class="kw" translate="no">RLI/LRI/FSI…PDI</code> rather than the embedding controls <code class="kw" translate="no">RLE/LRE…PDF</code>.</p>

<div class="xref"><span class="seealso">See also</span>
<p><a href="https://www.w3.org/International/articles/inline-bidi-markup/bidi_examples#uc5">An example of incorrect ordering of things such as text followed by numbers in HTML</a></p>
<p><a href="https://www.w3.org/International/articles/inline-bidi-markup/bidi_examples#usecase3">An example of incorrect ordering of lists</a></p>
</div>

</section>


<section id="control_problems">
<h4>Problems with control characters</h4>
<p>Reasons to avoid relying on  control characters to set direction  include the following:</p>
<ol>
  <li>They are invisible in most editors and are therefore difficult to work with, and can easily lead to orphans and overlapping ranges.  They can be particularly difficult to manage when editing bidirectional inline text because it's hard to position the cursor in the correct place. If you ask someone who writes in a right-to-left script, you are likely to find that they dislike using control codes.</li>
  <li>Users often don't have the necessary characters available on their keyboard, or have difficulty inputting them.</li>
  <li>It is sometimes necessary to choose which to use based on context or the type of the data, and this means that a content author typically needs to select the control codes – specifying control codes in this way for all paragraphs is time-consuming and error-prone.</li>
  <li>Processors that extract parts of the data, add to it, or reuse in combination with other text  may incorrectly handle the control codes.</li>
  <li>Search and comparison algorithms should ignore these characters, but typically don't.</li>
  </ol>
<p>The last two items above may also hold for markup, but implementers often support included markup better than  included control codes.</p>
<p>Don't expect users to add control codes at the start and end of every paragraph. That's far too much work.</p>
</section>


<section id="rlmlrm">
<h4>Strong directional formatting characters: RLM, LRM, and ALM</h4>
<p>A word about the Unicode characters <span class="codepoint" translate="no"><img alt="RLM" src="images/200F.png"><code class="uname">U+200F RIGHT-TO-LEFT MARK</code></span> (RLM), <span class="codepoint" translate="no"><img alt="LRM" src="images/200E.png"><code class="uname">U+200E LEFT-TO-RIGHT MARK</code></span> (LRM), and <span class="codepoint" translate="no"><img alt="ALM" src="images/061C.png"><code class="uname">U+061C ARABIC LETTER MARK</code></span> (ALM) is warranted at this point.</p>
<p>The first point to be clear about is that these three characters do not establish the base direction for a range of text. They are simply invisible characters with strong directional properties.</p>
<p>This means that you cannot use RLM, for example, to make the text <kbd>W3C</kbd> appear to the left of the Hebrew text in the following example.</p>
<p>The title is "<span dir="rtl" lang="he">פעילות הבינאום, W3C</span>".</p>
<p>For this you can only use metadata or the paired control characters.</p>
<p>Of course, if you are detecting base direction using first-strong heuristics (such as <code>dir="auto"</code> in HTML), then inserting an RLM, ALM, or LRM can be useful for influencing the base direction detected where the text in question begins with something that would otherwise give the wrong result. For example:</p>
<p>"<span dir="rtl" lang="ar">نشاط التدويل</span>" is how you say "i18n Activity" in Arabic.</p>
<p>Here an LRM could be placed at the start of the text, before the strong right-to-left Arabic characters, to prevent the algorithm from assuming that the text should be right-to-left. (Remember that if metadata is used to set the base direction, the strong directional formatting character is ignored, unless the metadata specifically says that first-strong heuristics should be used.)</p>
<p>Finally, a note about the use of <span class="codepoint" translate="no"><img alt="ALM" src="images/061C.png"><code class="uname">U+061C ARABIC LETTER MARK</code></span> (ALM). This character is used to influence the display of sequences of numbers in Arabic script text in cases where no Arabic letters occur before the number.</p>
<aside class="example" title="Example of ALM usage">
   <p>In some Arabic-script languages the range <code dir="rtl">100-200</code> should appear as <code dir="rtl">&#x061c;100-200</code>. If no Arabic letters appear before the numbers, the [=Unicode Bidirectional Algorithm=] will not perform this reordering. Note that the character sequence in both cases is "100-200" and that both have a <kbd>code</kbd> element with a <code>dir="rtl"</code> around them.  In the third example, an ALM is used to provide the necessary hint, like so:</p>
   <table>
	   <thead>
		   <tr><th>Description</th><th>HTML / Appearance</th></tr>
	   </thead>
	   <tbody>
		   	<tr>
			   <td rowspan="2">Preceded by Arabic letters</td>
			   <td><pre class="html">&lt;code dir="rtl" lang="ar"&gt;&#x0634;&#x0627;&#x0637; &#x0627;&#x0644;&#x062A;&#x062F;&#x0648;&#x064A;&#x0644; 100-200&lt;/code&gt;</pre></td>
			</tr><tr>
			   <td dir="rtl" class="spilloverExample"><code dir="rtl" lang="ar">&#x0634;&#x0627;&#x0637; &#x0627;&#x0644;&#x062A;&#x062F;&#x0648;&#x064A;&#x0644; 100-200</code></td>
		   </tr>
		   <tr>
			   <td rowspan="2">Without ALM</td>
			   <td><pre class="html">&lt;code dir="rtl" lang="ar"&gt;100-200&lt;/code&gt;</pre></td>
		   </tr><tr>
			   <td dir="rtl" class="spilloverExample"><code dir="rtl" lang="ar">100-200</code></td>
		   </tr>
		   <tr>
			   <td rowspan="2">With ALM</td>
			   <td><pre class="html">&lt;code dir="rtl" lang="ar"&gt;&amp;#x061C;100-200&lt;/code&gt;</pre></td>
		   </tr><tr>
			   <td dir="rtl" class="spilloverExample"><code dir="rtl" lang="ar" >&#x061C;100-200</code></td>
		   </tr>
	   </tbody>
   </table>
</aside>
</section>


<section id="bd_language">
<h4>Base direction and language</h4>

 	<div class="req" id="bidi_lang">
	<p class="advisement">Do not assume that direction can be determined from language information.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/International/questions/qa-direction-from-language">Can we derive base direction from language?</a>, W3C article.</p>
	</details>
	</div>

    <p>The following are all reasons you cannot use language tags to provide information about base direction:</p>
    <ol>
    <li>you can't produce the <code class="kw" translate="no">auto</code> value with language tags.</li>
    <li>some languages are written with both RTL and LTR scripts.</li>
    <li>the only reliable part of the language tag that would indicate the base direction is the script tag, but BCP47 recommends that you suppress the use of the script tag for languages that don't usually need it, such as  Hebrew (<code translate="no">Suppress-Script: Hebr</code>). Languages, such as Persian, that are usually written in an RTL script may be written in transcribed form, and it's not possible to guarantee that the necessary script tag would be present to carry the directional information. In summary, you won't be able to rely on people supplying script tags as part of the language information in order to influence direction.</li>
    <li>the use of language tags and the use of base direction markers often don't coincide.</li>
    <li>they are not semantically equivalent.</li>
    </ol>
</section>
</section>


<section id="bidi_values" class="subtopic">
<h3>Base direction values</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Abidi_values" target="_blank">See related review comments.</a></p>

	<div class="req" id="bidi_values_req">
	<p class="advisement">Values for the default base direction should include left-to-right, right-to-left, and auto.</p>
	</div>


<p>The <code class="kw" translate="no">auto</code> value allows automatic detection of the base direction for a piece of text. For example, the <code class="kw" translate="no">auto</code> value of <code class="kw" translate="no">dir</code> in HTML looks for the first strong directional character in the text, but ignores certain items of markup also, to guess the base direction of the text. Note that automatic detection algorithms are far from perfect. First-strong detection is unable to correctly identify text that is really right-to-left, but that begins with a strong LTR character. Algorithms that attempt to judge the base direction based on contents of the text are also problematic. The best scenario is one where the base direction is known and declared.</p>
</section>


<section id="bidi_markup" class="subtopic">
<h3>Handling direction in markup</h3>


<p>This section is about defining approaches to bidi handling that work with resources that organize content using markup.  Some of the recommendations are different from those for handling strings on the Web (see [[[#bidi_strings]]]).</p>


<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Abidi_markup" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_bidi_markup">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/html-bidi/">Additional Requirements for Bidi in HTML &amp; CSS</a>.</p></li>
</ul>
</aside>


<section id="sec_default_base">
<h4>Setting the default base direction</h4>

	<div class="req" id="bidi_whole_res">
	<p class="advisement">The spec should indicate how to define a default base direction for the resource as a whole, i.e. set the overall base direction.</p>
	</div>


	<div class="req" id="bidi_res_default">
	<p class="advisement">The  default base direction, in the absence of other information, should be <span class="kw" translate="no">auto</span>.</p>
	</div>
</section>


  <section id="bidi_block">
<h4>Establishing the base direction for paragraphs</h4>

 	<div class="req" id="bidi_block_change">
	<p class="advisement">The content author must be able to indicate parts of the text where the base direction changes. At the block level, this should be achieved using attributes or metadata, and should not require the content author to use Unicode control characters to control direction.</p>
	</div>


  <p>Relying on Unicode control characters to establish direction for every block is not feasible because line breaks terminate the effect of such control characters. It also makes the data much less stable, and unnecessarily difficult to manage if control characters have to appear at every point where they would be needed.</p>

 	<div class="req" id="bidi_block_auto">
	<p class="advisement">It must be possible to also set the direction for content fragments to <code class="kw" translate="no">auto</code>. This means that the base direction will be determined by examining the content itself.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/TR/html-bidi/#auto-direction-algorithms">Estimation algorithms</a>, in <cite>Additional Requirements for Bidi in HTML &amp; CSS</cite>.</p>
	</details>
	</div>

  <p>A typical approach here would be to set the direction based on the first strong directional character outside of any markup, but this is not the only possible method. The algorithm used to determine directionality when direction is set to auto should match that expected by the receiver.</p>
  <p>The first-strong algorithm looks for the first character in the paragraph with a strong directional property according to the Unicode definitions. It then sets the base direction of the paragraph according to the direction of that character.</p>
  <p>Note that the first-strong algorithm may incorrectly guess the direction of the paragraph when the first character is not typical of the rest of the paragraph, such as when an RTL paragraph or line starts with an LTR brand name or technical term.</p>
  <p>For additional information about algorithms for detecting direction, see <a href="https://www.w3.org/TR/html-bidi/#auto-direction-algorithms">Estimation algorithms</a> in the document where this was discussed with reference to HTML.</p>


	<div class="req" id="bidi_block_para">
	<p class="advisement">If the overall base direction is set to <code class="kw" translate="no">auto</code> for plain text, the direction of content paragraphs should be determined on a paragraph by paragraph basis.</p>
	</div>


	<div class="req" id="bidi_block_befaft">
	<p class="advisement">To indicate the sides of a block of text  relative to the start and end of its contained lines,  use 'block-start' and 'block-end', rather than 'top' and 'bottom'.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://drafts.csswg.org/css-logical/">CSS Logical Properties and Values Level 1</a></p>
	</details>
  </div>


	<div class="req" id="bidi_inline_start_end">
	<p class="advisement">To indicate the start/end of a line  you should use 'start' and 'end', or 'inline-start' and 'inline-end', rather than 'left' and 'right'.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://drafts.csswg.org/css-logical/">CSS Logical Properties and Values Level 1</a></p>
	</details>
	</div>


	<div class="req" id="bidi_dedicated_attr">
	<p class="advisement">Provide dedicated attributes for control of base direction and bidirectional overrides; do not rely on the user applying style properties to arbitrary markup to achieve bidi control.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/International/questions/qa-bidi-css-markup.en">CSS vs. markup for bidi support</a>, W3C article.</p>
	</details>
	</div>

  <p>For example, HTML has a <code class="kw" translate="no">dir</code> attribute that is capable of managing base direction without assistance from CSS styling. XML formats should define dedicated markup to represent directional information, even if they need CSS to achieve the required display, since the text may be used in other ways.</p>
  <p>Style sheets such as CSS may not always be used with the data, or carried with the data when it is syndicated, etc. Directional information is fundamentally important to correct display of the data, and should be associated more closely and more permanently with the markup or data.</p>
</section>
</section>


<section id="bidi_strings" class="subtopic">
<h3>Handling base direction for strings</h3>

<p class="note">The information in this section is pulled from <a href="https://www.w3.org/TR/string-meta/">Strings on the Web: Language and Direction Metadata</a>. That document is still being written, so these guidelines are likely to change at any time.</p>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Abidi_strings" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_bidi_strings">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/">Strings on the Web: Language and Direction Metadata</a>.</p>
	<ul>
	<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#bp_and-reco">Best Practices, Recommendations, and Gaps</a>.</p></li>
	<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#use_cases">Requirements and Use Cases</a>.</p></li>
	<li class="w3"><p class="link"><a href="https://www.w3.org/TR/string-meta/#bidi-approaches">Approaches Considered for Identifying the Base Direction</a>.</p></li>
	</ul>
</li>
</ul>
</aside>


	<div class="req" id="bidi_strings_metadata">
	<p class="advisement">Provide metadata constructs that can be used to indicate the base direction of any <a>natural language</a> string.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/TR/string-meta/#bp_and-reco">Best Practices, Recommendations, and Gaps</a>, in <cite>Strings on the Web: Language and Direction Metadata</cite></p>
	</details>
	</div>

	<div class="req" id="bidi_strings_heuristics">
	<p class="advisement">Specify that consumers of strings should use  heuristics, preferably based on the Unicode Standard first-strong algorithm, to detect the base direction of a string except where metadata is provided.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/TR/string-meta/#bp_and-reco">Best Practices, Recommendations, and Gaps</a>, in <cite>Strings on the Web: Language and Direction Metadata</cite></p>
	</details>
	</div>

	<div class="req" id="bidi_strings_default">
	<p class="advisement">Where possible, define a field to  indicate the default direction for all strings in a given resource or document.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/TR/string-meta/#bp_and-reco">Best Practices, Recommendations, and Gaps</a>, in <cite>Strings on the Web: Language and Direction Metadata</cite></p>
	</details>
	</div>

	<div class="req" id="bidi_strings_metadata_and_default">
	<p class="advisement">Do NOT assume that creating a document-level default without the ability to change direction for any string is sufficient.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/TR/string-meta/#bp_and-reco">Best Practices, Recommendations, and Gaps</a>, in <cite>Strings on the Web: Language and Direction Metadata</cite></p>
	</details>
	</div>

	<div class="req" id="bidi_strings_script_tag">
	<p class="advisement">If metadata is not available due to legacy implementations and cannot otherwise be provided, specifications MAY allow a [=string direction=] to be interpolated from available language metadata.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/TR/string-meta/#bp_and-reco">Best Practices, Recommendations, and Gaps</a>, in <cite>Strings on the Web: Language and Direction Metadata</cite></p>
	</details>
	</div>

	<div class="req" id="bidi_strings_paired_controls">
	<p class="advisement">Specifications MUST NOT require the production or use of paired bidi controls.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/TR/string-meta/#bp_and-reco">Best Practices, Recommendations, and Gaps</a>, in <cite>Strings on the Web: Language and Direction Metadata</cite></p>
	</details>
	</div>

</section>


<section id="bidi_inline" class="subtopic">
<h3>Setting base direction for inline or substring text</h3>

<p>'Inline text' here has a readily understandable meaning in markup. It also applies to strings (eg. in JSON, CSV, or other plain text formats), meaning runs of characters which don't include all the characters in the string.</p>
 
 
 	<div class="req" id="bidi_inline_change">
	<p class="advisement">It must be possible to indicate spans of inline text where the base direction changes. If markup is available, this is the preferred method. Otherwise your specification must require that Unicode control characters are recognized by the receiving application, and correctly implemented.</p>
	</div>

 	<div class="req" id="bidi_inline_auto">
	<p class="advisement">It must be possible to also set the direction for a span of inline text to <code class="kw" translate="no">auto</code>, which means that the base direction will be determined by examining the content itself. A typical approach here would be to set the direction based on the first strong directional character outside of any markup.</p>
	</div>

    <p>The first-strong algorithm looks for the first character in the paragraph with a strong directional property according to the Unicode definitions. It then sets the [=base direction=] of the paragraph according to the direction of that character.</p>
    
    <p>Note that the first-strong algorithm may incorrectly guess the direction of the paragraph when the first character is not typical of the rest of the paragraph, such as when an [=RTL=] paragraph or line starts with a [=LTR=] brand name or technical term.</p>
    
    <p>For additional information about algorithms for detecting direction, see <a href="https://www.w3.org/TR/html-bidi/#auto-direction-algorithms">Estimation algorithms</a> in the document where this was discussed with reference to HTML.</p>


 	<div class="req" id="bidi_inline_rli">
	<p class="advisement">If users use Unicode bidirectional control characters, the isolating RLI/LRI/FSI with PDI characters must be supported by the application and recommended (rather than RLE/LRE with PDF) by the spec.</p>
	</div>

 	<div class="req" id="bidi_inline_rlm">
	<p class="advisement">Use of RLM/LRM should be appropriate, and expectations of what those controls can and cannot do should be clear in the spec.</p>
	</div>

    <p>The Unicode bidirectional control characters <span class="codepoint" translate="no"><img alt="RLM" src="images/200F.png"><code class="uname">U+200F RIGHT-TO-LEFT MARK</code></span> and <span class="codepoint" translate="no"><img alt="LRM" src="images/200E.png"><code class="uname">U+200E LEFT-TO-RIGHT MARK</code></span> are not sufficient on their own to manage bidirectional text. They cannot produce a different base direction for embedded text. For that you need to be able to indicate the start and end of the range of the embedded text. This is best done by markup, if available, or failing that using the other Unicode bidirectional controls mentioned just above.</p>


 	<div class="req" id="bidi_inline_dedicated_attr">
	<p class="advisement">For markup, provide dedicated attributes for control of base direction and bidirectional overrides; do not rely on the user applying style properties to arbitrary markup to achieve bidi control.</p>
	</div>

 	<div class="req" id="bidi_inline_all_elems">
	<p class="advisement">For markup, allow bidi attributes on all inline elements in markup that contain text.</p>
	</div>

 	<div class="req" id="bidi_inline_embed">
	<p class="advisement">For markup, provide attributes that allow the user to (a) create an isolated or embedded base direction or (b) override the bidirectional algorithm altogether. Such attributes should allow the user to set the direction to LTR, RTL, or Auto in either of these two scenarios.</p>
	</div>
</section>


<section id="bidi_detection" class="subtopic">
<h3><em>Detecting &amp; matching direction (TBD)</em></h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Abidi_detection_x" target="_blank">See related review comments.</a></p>
</section>

</section>


<section id="characters" class="topic">
<h2>Characters</h2>


<!--p>In this section:</p>
	<ul class="summary">
	<li>[[[#char_def]]]</li>
	<li>[[[#char_referencemodel]]]</li>
	<li>[[[#char_ranges]]]</li>
	<li>[[[#char_pua]]]</li>
	<li>[[[#char_choosing]]]</li>
	<li>[[[#char_identifying]]]</li>
	<li>[[[#char_escapes]]]</li>
	<li>[[[#char_storing]]]</li>
	<li>[[[#char_string]]]</li>
	<li>[[[#char_ref_Unicode_char]]]</li>
	<li>[[[#char_unicoderef]]]</li>
	</ul-->

<div id="characters_checklist" class="summaryC"></div>


<!--details class="checklist" style="cursor: pointer;">
<summary onClick="showChecklist(this.parentNode.parentNode, 'char_checklist')">Show recommendations as a checklist</summary>
<div id="char_checklist"></div>
</details-->


<aside class="links" id="links_lang_mixed">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/">Character Model for the World Wide Web: Fundamentals</a> — basic guidelines related to the use of characters and encodings.</p></li>
<li class="notw3"><p class="link"><a href="https://encoding.spec.whatwg.org/">Encoding</a> — specification with further guidelines related to use of character encodings.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod-norm/">Character Model for the World Wide Web: String Matching</a> — issues that arise when you try to compare two strings, be it identifiers or authored content.</p></li>
</ul>
</aside>


    <p>The term <em>character</em> is often used to mean different things in different contexts: it can variously refer to the visual, logical, or byte-level representation of a given piece of text. This makes the term too imprecise to use when specifying algorithms, protocols, or document formats. Understanding how characters are defined and encoded in computing systems, along with the associated terminology used to make such specification unambiguous, is thus a necessary prerequisite to discussing the processing of string data.</p>

    <p>The visual manifestation of a "character"&mdash;the shape most people mean when they say "character"&mdash;is what we call a <dfn class="lint-ignore">user-perceived character</dfn>. These visual building blocks are usually perceived to be a single unit of the visible text.</p>

    <p>At their simplest, user-perceived characters are a single shape that can be tied one-to-one to the underlying computing representation. But a user-perceived character can be formed, in some scripts, from more than one character. And a given logical character can take many different shapes due to such influences as font selection, style, or the surrounding context (such as adjacent characters). In some cases, a single user-perceived character might be formed from a long sequence of logical characters. And some logical characters (so-called "combining marks") are always used in conjunction with another character.</p>

    <p>When user-perceived characters are represented visibly (on screen or in print), they are represented by individual rendering units. This visual unit is called a <a>grapheme</a> (the word <a>glyph</a> is also used). Graphemes are the visual units found in fonts and rendering software.</p>

    <aside class=example>
		<h5>Examples of user-perceived characters</h5>
		<p>Here is the word for "Unicode" in the Latin, Katakana, Arabic, and Devanagari scripts.</p>
		<p class=bigtext>Unicode
		   <span lang=ja>&#x30E6;&#x30CB;&#x30B3;&#x30FC;&#x30C9;</span>
		   <span lang=ar dir=rtl>&#x064A;&#x0648;&#x0646;&#x064A;&#x0643;&#x0648;&#x062F;</span>
		   <span lang=hi>&#x092F;&#x0942;&#x0928;&#x093F;&#x0915;&#x094B;&#x0921;</span>
		</p>
		<p>The last example, in the Devanagari script (which is used to write Hindi, among other languages) is made up of four graphemes:</p>
		<p class=bigtext lang=hi>&#x092F;&#x0942;&nbsp;&#x0928;&#x093F;&nbsp;&#x0915;&#x094B;&nbsp;&#x0921;</p>
    </aside>

    <p>Graphemes are encoded into computer systems using "logical characters". A <a>character set</a> is a set of logical characters: a specific collection of characters that can be used together to encode text. The most important character set is the <a>Universal Character Set</a>, also known as [[Unicode]]. This character set includes all of the characters used to encode text, including historical or extinct writing systems as well as modern usage, private use, typesetting symbols, and other things, such as the emoji. Other character sets are defined subsets of Unicode. In Unicode, a 'character' is a single abstract logical unit of text. Each character in Unicode is assigned a unique integer number between <code>0x0000</code> and <code>0x10FFFF</code>, which is called its <a>code point</a>. The term <a>code point</a> therefore unambiguously refers to a single logical character and its integer representation.</p>

	<div class="req" id="char_term_def">
	<p class="advisement">Specifications SHOULD explicitly define the term 'character' to mean a Unicode code point.</p>
	</div>

    <p>The relationship between code points and graphemes can be complex. In most cases, a code point sequence that forms a single grapheme should be treated as a single textual unit. For example, when cursoring across text, an entire grapheme should be selected together. It shouldn't be possible to cursor into the "middle" of a grapheme or delete only a part of a user-perceived character. Because the relationship is not one-to-one between code points and graphemes and because the relationship can be somewhat complex, [[Unicode]] defines a specific type of grapheme: the <a>extended grapheme cluster</a> which most closely matches the mapping of the underlying logical character sequence to a user-perceived character. When referring to 'graphemes' in this document, we mean extended grapheme clusters (unless otherwise called out).</p>

    <aside class=example>
		<h5>Hindi example showing mapping from graphemes to code points</h5>
		<p>Returning to the example above, the Hindi word for Unicode is made of four graphemes:</p>
		<p class=bigtext lang=hi>&#x092F;&#x0942;&nbsp;<span style="color:red">&#x0928;&#x093F;</span>&nbsp;&#x0915;&#x094B;&nbsp;&#x0921;</p>
		<p>Several of these graphemes are made up of more than one Unicode character because of the way that the Devanagari script works. In Devanagari, the basic set of "letters" are syllables ending with the short 'a' vowel sound. When you want to use a different vowel, you add a combining vowel character that changes the shape of the grapheme. The red text in the example above is the syllable "ni" in "Unicode". It is made of two characters: U+0928 (the syllable "na") and U+093F (combining "short i" sound):</p>

		<table>
			<tr>
				<td class=bigtext>&#x092f;</td>
				<td class=bigtext>&#x0942;</td>
				<td class=bigtext style="color:red">&#x0928;</td>
				<td class=bigtext style="color:red">&#x093f;</td>
				<td class=bigtext>&#x0915;</td>
				<td class=bigtext>&#x094b;</td>
				<td class=bigtext>&#x0921;</td>
			</tr>
			<tr>
				<td><code>U+092F</code></td>
				<td><code>U+0942</code></td>
				<td><code>U+0928</code></td>
				<td><code>U+093F</code></td>
				<td><code>U+0915</code></td>
				<td><code>U+094B</code></td>
				<td><code>U+0921</code></td>
			</tr>
		</table>

    </aside>

    <p>Another example of the complex relationship between code points and graphemes are certain emoji. The emoji character for "family" has a code point in Unicode: <span class="codepoint" translate="no"><bdi lang="en">&#x1F46A;</bdi><code class="uname">U+1F46A FAMILY</code></span>. It can also be formed by using a sequence of code points: <code class="uname">U+1F468 U+200D U+1F469 U+200D U+1F466</code>. Altering or adding other emoji characters can alter the composition of the family. For example the sequence <span class="codepoint" translate="no"><bdi translate="no">&#x1f468;&#x200d;&#x1f469;&#x200d;&#x1f467;&#x200d;&#x1f467;</bdi><code class="uname" translate="no">U+1F468 U+200D U+1F469 U+200D U+1F467 U+200D U+1F467</code></span> results in a composed emoji character for a "family: man, woman, girl, girl" on systems that support this kind of composition. Many common emoji can <em>only</em> be formed using sequences of code points, but should be treated as a single user-perceived character when displaying or processing the text. You wouldn't want to put a line-break in the middle of the family!</p>

    <p>Unicode code points are just abstract integer values: they are not the values actually present in the memory of the computer or serialized on the wire. When processing text, computers use an array of fixed-size integer units. One such common unit is the <dfn class="lint-ignore">byte</dfn> (or <em>octet</em>, since bytes have 8 bits per unit). There are also 16-bit, 32-bit, or other size units. In many programming languages, the unit is called a <code>char</code>, which suggests that strings are made of "characters". We use the term <a>code unit</a> to refer unambiguously to the programming and serialized representation of characters. For example, in C, a <code>char</code> is generally an 8-bit byte: each <code>char</code> is an 8-bit code unit. In Java or JavaScript, a <code>char</code> is a 16-bit value.</p>

    <p>A set of rules for converting code points to or from code units is called a <a>character encoding form</a> (or just "character encoding" for short).</p>

    <aside class=example>
       <h5>UTF-8 Character Encoding Form</h5>

       <p>The most common character encoding used on the Web is UTF-8. UTF-8 uses 8-bit bytes as its code unit. Each Unicode code point encoded into UTF-8 takes between one and four bytes to encode. ASCII characters take one byte to encode. Code points from 0x80 to 0x7FF take two bytes. Code points from 0x800 to 0xFFFF take three bytes. And code points from 0x10000 to 0x10FFFF (that is, the rest of Unicode) take four bytes each.</p>

       <table class=cpExample>
		   <tr>
			   <td>Grapheme</td>
			   <td class=bigtext>A</td>
			   <td class=bigtext>&#xc0;</td>
		   </tr>
		   <tr>
			   <td>Code Point</td>
			   <td><code>U+0041</code></td>
			   <td><code>U+00C0</code></td>
		   </tr>
		   <tr>
			   <td>Code Units (bytes)</td>
			   <td><code>0x41</code></td>
			   <td><code>0xC3 0x80</code></td>
		   </tr>
		   <tr>
			   <td>Grapheme</td>
			   <td class=bigtext>&#x928;</td>
			   <td class=bigtext>&#x1F46A;</td>
		   </tr>
		   <tr>
			   <td>Code Point</td>
			   <td><code>U+0928</code></td>
			   <td><code>U+1F46A</code></td>
		   </tr>
		   <tr>
			   <td>Code Units (bytes)</td>
			   <td><code>0xE0 0xA4 0xA8</code></td>
			   <td><code>0xF0 0x9F 0x91 0xAA</code></td>
		   </tr>
       </table>
    </aside>


<section id="char_def" class="subtopic">
<h3>Choosing a definition of 'character'</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Achar_def" target="_blank">See related review comments.</a></p>

<p>The term character is used differently in a variety of contexts and often leads to confusion when used outside of these contexts. In the context of the digital representations of text, a character can be defined as a small logical unit of text. Text is then defined as sequences of characters. While such an informal definition is sufficient to create or capture a common understanding in many cases, it is also sufficiently open to create misunderstandings as soon as details start to matter. In order to write effective specifications, protocol implementations, and software for end users, it is very important to understand that these misunderstandings can occur.</p>

<p>This section examines some of these contexts, meanings and confusions.</p>


<div class="xref"><span class="seealso">See also</span>
<p>[[[#char_string]]].</p>
</div>

<aside class="links" id="links_char_def">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Perceptions">Perceptions of Characters</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
</ul>
</aside>


	<div class="req" id="char_specific">
	<p class="advisement">Specifications SHOULD use specific terms, when available, instead of the general term 'character'.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-PerceptionsOutro">Perceptions of Characters, Summary C067</a>, in <cite>Character Model for the World Wide Web 1.0: Fundamentals</cite>.</p>
	</details>
	</div>


<p>Specific terms could include [=code point=], [=grapheme cluster=], <a href="https://www.w3.org/TR/css-text-3/#typographic-character-unit">typographic character unit</a>, [=code unit=], and [=glyph=].</p>


<div class="req" id="char_define">
	<p class="advisement">When specifications use the term 'character' the specifications MUST define which meaning they intend, and  SHOULD explicitly define the term 'character' to mean a Unicode code point.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-PerceptionsOutro">Perceptions of Characters, Summary C010</a>, in <cite>Character Model for the World Wide Web 1.0: Fundamentals</cite></p>
	</details>
  </div>

<p>The developers of specifications, and the developers of software based on those specifications, are likely to be more familiar with usages of the term 'character' they have experienced and less familiar with the wide variety of usages in an international context. Furthermore, within a computing context, characters are often confused with related concepts, resulting in incomplete or inappropriate specifications and software.</p>

	<div class="req" id="char_physical_storage">
	<p class="advisement">Specifications, software and content MUST NOT require or depend on a one-to-one relationship between characters and units of physical storage.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-Storage">Units of storage C009</a>, in <cite>Character Model for the World Wide Web 1.0: Fundamentals</cite></p>
	</details>
	</div>

<p>Computer storage and communication rely on units of physical storage and information interchange, such as bits and bytes (8-bit units, also called octets). A frequent error in specifications and implementations is the equating of characters with units of physical storage. The mapping between characters and such units of storage is actually quite complex.</p>


<div class="req" id="char_sounds">
	<p class="advisement">Specifications, software and content MUST NOT require or depend on a one-to-one correspondence between characters and the sounds of a language.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-WritingSystem">Units of aural rendering C001</a>, in <cite>Character Model for the World Wide Web 1.0: Fundamentals</cite></p>
	</details>
  </div>

<p>In some scripts, characters have a close relationship to phonemes (a <span class="new-term">phoneme</span> is a minimally distinct sound in the context of a particular spoken language), while in others they are closely related to meanings. Even when characters (loosely) correspond to phonemes, this relationship may not be simple, and there is rarely a one-to-one correspondence between character and phoneme.</p>

<div class="example">
<p>The following are examples of mismatches between the term character and units of sound:</p>

<ul>
<li>In the English sentence, "<span class="quote">They were too close to the door to close it.</span>" the same character '<span class="qchar">s</span>' is used to represent both /s/ and /z/ phonemes.</li>

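<li>In the English language the phoneme /k/ of "<span class="quote">cool</span>" is like the phoneme /k/ of "<span class="quote">keel</span>", yet it is written with two different characters ('<span class="qchar">c</span>' and '<span class="qchar">k</span>').</li>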

<li>In many scripts a single character may represent a sequence of
phonemes, such as the syllabic characters of Japanese hiragana.</li>

<li>In many writing systems a sequence of characters may represent a single phoneme, for example '<span class="qchar">th</span>' and '<span class="qchar">ng</span>' in "<span class="quote">thing</span>".</li>
</ul>
</div>

	<div class="req" id="char_display">
	<p class="advisement">Specifications, software and content MUST NOT require or depend on a one-to-one mapping between characters and units of displayed text.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-VisualRenderingUnits">Units of visual rendering C002</a>, in <cite>Character Model for the World Wide Web 1.0: Fundamentals</cite>.</p>
	</details>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-CharExamples">Examples of Characters, Keystrokes and Glyphs</a>, in <cite>Character Model for the World Wide Web 1.0: Fundamentals</cite>.</p>
	</details>
</div>

<p>Visual rendering introduces the notion of a <a href="https://www.w3.org/TR/i18n-glossary/#def_glyph">glyph</a>. Glyphs are defined by ISO/IEC 9541-1 as <q>a recognizable abstract graphic symbol which is independent of a specific design</q>. There is <em>not</em> a one-to-one correspondence between characters and glyphs:</p>
<ul>
<li>A single character can be represented by multiple glyphs
(each glyph is then part of the representation of that character). These glyphs
may be physically separated from one another. </li>

<li>A single glyph may represent a sequence of characters (this
is the case with ligatures, among others).</li><li>A character may be rendered with very different glyphs depending on the context.</li>

<li>A single glyph may represent different characters (e.g.
capital Latin A, capital Greek A and capital Cyrillic A).</li>
</ul>

<p>A set of glyphs makes up a <span class="new-term">font</span>. Glyphs can be
construed as the basic units of organization of the visual rendering of text,
just as characters are the basic unit of organization of encoded text.</p>

<p>See <a href="https://www.w3.org/TR/charmod/#sec-CharExamples">Examples of Characters, Keystrokes and Glyphs</a> for examples of the complexities of character to glyph mapping.</p>


<div class="req" id="char_keystroke">
	<p class="advisement">Specifications and software MUST NOT require nor depend on a single keystroke resulting in a single character, nor that a single character be input with a single keystroke (even with modifiers), nor that keyboards are the same all over the world.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-InputUnits">Units of input C005</a>, in <cite>Character Model for the World Wide Web 1.0: Fundamentals</cite>.</p>
	</details>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-CharExamples">Examples of Characters, Keystrokes and Glyphs</a>, in <cite>Character Model for the World Wide Web 1.0: Fundamentals</cite>.</p>
	</details>
  </div>

<p>In keyboard input, it is not always the case that keystrokes and input characters correspond one-to-one. A limited number of keys can fit on a keyboard. Some keyboards will generate multiple characters from a single keypress. In other cases ('<span class="qterm">dead keys</span>') a key will generate no characters, but affect the results of subsequent keypresses. Many writing systems have far too many characters to fit on a keyboard and must rely on more complex <span class="new-term">input methods</span>, which transform keystroke sequences into character sequences. Other languages may make it necessary to input some characters with special modifier keys.</p>
<p>See <a href="https://www.w3.org/TR/charmod/#sec-CharExamples">Examples of Characters, Keystrokes and Glyphs</a> for examples of non-trivial input.</p>
</section>


<section id="char_referencemodel" class="subtopic">
<h3>Defining a Reference Processing Model</h3>

<aside class="links" id="links_char_referencemodel">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Characters">Digital Encoding of Characters</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
</ul>
</aside>

<div class="xref"><span class="seealso">See also</span>
<p>[[[#char_ranges]]].</p>
</div>


	<div class="req" id="char_single_enc">
	<p class="advisement">Textual data objects defined by protocol or format specifications MUST be in a single character encoding.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefProcModel">Reference Processing Model C013</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>


	<div class="req" id="char_rpm">
	<p class="advisement">All specifications that involve processing of text MUST specify the processing of text according to the Reference Processing Model described by the rest of the recommendations in this list.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefProcModel">Reference Processing Model C014</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

	<div class="req" id="char_unicode_chars">
	<p class="advisement">Specifications MUST define text in terms of Unicode characters, not bytes or glyphs.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefProcModel">Reference Processing Model C014</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

	<div class="req" id="char_transcode">
	<p class="advisement">For their textual data objects specifications MAY allow use of any character encoding which can be transcoded to a Unicode encoding form.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefProcModel">Reference Processing Model C014</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>


	<div class="req" id="char_as_unicode">
	<p class="advisement">Specifications MAY choose to disallow or deprecate some character encodings and to make others mandatory. Independent of the actual character encoding, the specified behavior MUST be the same as if the processing happened as follows: (a) The character encoding of any textual data object received by the application implementing the specification MUST be determined and the data object MUST be interpreted as a sequence of Unicode characters - this MUST be equivalent to transcoding the data object to some Unicode encoding form, adjusting any character encoding label if necessary, and receiving it in that Unicode encoding form, (b) All processing MUST take place on this sequence of Unicode characters, (c) If text is output by the application, the sequence of Unicode characters MUST be encoded using a character encoding chosen among those allowed by the specification.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefProcModel">Reference Processing Model C014</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

	<div class="req" id="char_different_enc">
	<p class="advisement">If a specification is such that multiple textual data objects are involved (such as an XML document referring to external parsed entities), it MAY choose to allow these data objects to be in different character encodings. In all cases, the Reference Processing Model MUST be applied to all textual data objects.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefProcModel">Reference Processing Model C014</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>
  </section>


<section id="char_ranges" class="subtopic">
<h3>Including and excluding character ranges </h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Achar_ranges" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_char_ranges">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Characters">Digital Encoding of Characters</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
</ul>
</aside>

<div class="xref"><span class="seealso">See also</span>
<p>[[[#char_pua]]].</p>
</div>


	<div class="req" id="char_exclude">
	<p class="advisement">Specifications SHOULD NOT arbitrarily exclude code points from the full range of Unicode code points from U+0000 to U+10FFFF inclusive.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefProcModel">Reference Processing Model C070</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>

	<div class="req" id="char_10ffff">
	<p class="advisement">Specifications MUST NOT allow code points above U+10FFFF.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefProcModel">Reference Processing Model C077</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>

	<div class="req" id="char_internal_use">
	<p class="advisement">Specifications SHOULD NOT allow the use of code points reserved by Unicode for internal use.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefProcModel">Reference Processing Model C079</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>

	<div class="req" id="char_surrogate">
	<p class="advisement">Specifications MUST NOT allow the use of unpaired surrogate code points.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefProcModel">Reference Processing Model C078</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>
    
    <p>A "surrogate code point" is a code point in the range <code>U+D800</code> through <code>U+DFFF</code> inclusive. These code points are reserved to allow the UTF-16 character encoding to address <a>supplementary characters</a>. Surrogates are always used in pairs and only appear when the UTF-16 encoding is being used. A single surrogate code point is referred to as an "unpaired surrogate" and should never be used.</p>

	<div class="req" id="char_compatibility">
	<p class="advisement">Specifications SHOULD exclude compatibility characters in the syntactic elements (markup, delimiters, identifiers) of the formats they define.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-Compatibility">Compatibility and Formatting Characters C050</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>

	<div class="req" id="char_user_defined_values">
	<p class="advisement">Specifications SHOULD allow the full range of Unicode for user-defined values.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#sec_unicode_cs">Unicode case-insensitive matching</a>, in <cite>Character Model for the World Wide Web: String Matching</cite>.</p>
	</details>
	</div>
  </section>


<section id="char_pua" class="subtopic">
<h3>Using the Private Use Area</h3>

<div class="xref"><span class="seealso">See also</span>
<p>[[[#char_ranges]]].</p>
</div>

	<div class="req" id="char_not_pua">
	<p class="advisement">Specifications MUST NOT require the use of private use area characters with particular assignments.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-PrivateUse">Private use code points, C038</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

	<div class="req" id="char_pua_mechanisms">
	<p class="advisement">Specifications MUST NOT require the use of mechanisms for defining agreements of private use code points.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-PrivateUse">Private use code points, C039</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

	<div class="req" id="char_pua_allow">
	<p class="advisement">Specifications and implementations SHOULD NOT disallow the use of private use code points by private agreement.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-PrivateUse">Private use code points, C040</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

	<div class="req" id="char_symbols">
	<p class="advisement">Specifications MAY define markup to allow the transmission of symbols not in Unicode or to identify specific variants of Unicode characters.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-PrivateUse">Private use code points, C041</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

	<div class="req" id="char_pictures">
	<p class="advisement">Specifications SHOULD allow the inclusion of or reference to pictures and graphics where appropriate, to eliminate the need to (mis)use character-oriented mechanisms for pictures or graphics.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-PrivateUse">Private use code points, C068</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>
  </section>


<section id="char_choosing" class="subtopic">
<h3>Choosing character encodings</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Achar_choosing" target="_blank">See related review comments.</a></p>


<aside class="links" id="links_char_choosing">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Encodings">Choice and Identification of Character Encodings</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-doc-charset">Document character set</a> — what it is, and how it relates to the encodings used for a document.</p></li>
</ul>
</aside>


	<div class="req" id="char_identification">
	<p class="advisement">Specifications MUST either specify a unique character encoding, or provide character encoding identification mechanisms such that the encoding of text can be reliably identified.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-Encodings">Choice and Identification of Character Encodings, C015</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

	<div class="req" id="char_unique_for_new">
	<p class="advisement">When designing a new protocol, format or API, specifications SHOULD require a unique character encoding.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-Encodings">Choice and Identification of Character Encodings, C016</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

	<div class="req" id="char_enc_rules">
	<p class="advisement">When basing a protocol, format, or API on a protocol, format, or API that already has rules for character encoding, specifications SHOULD use rather than change these rules.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-Encodings">Choice and Identification of Character Encodings, C017</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>


	<div class="req" id="char_use_utf8">
	<p class="advisement">When a unique character encoding is required, the character encoding MUST be UTF-8, or UTF-16.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-UniqueEncoding">Mandating a unique character encoding, C018</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

<p class="note">The above guideline needs further consideration: UTF-16 and UTF-32 are not recommended these days. UTF-8 is the recommended encoding.</p>
	
	
 	<div class="req" id="char_charset">
	<p class="advisement">Specifications SHOULD avoid using the terms 'character set' and 'charset' to refer to a character encoding, except when the latter is used to refer to the MIME charset parameter or its IANA-registered values. The term 'character encoding', or in specific cases the terms 'character encoding form' or 'character encoding scheme', are RECOMMENDED.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-EncodingIdent">Character encoding identification, C020</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

 	<div class="req" id="char_iana">
	<p class="advisement">If the unique encoding approach is not taken, specifications SHOULD require the use of the IANA charset registry names, and in particular the names identified in the registry as 'MIME preferred names', to designate character encodings in protocols, data formats and APIs.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-EncodingIdent">Character encoding identification, C021</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>
	
    <p class="note">The above guideline needs further consideration: the character encodings recommended for Web specifications are listed in the Encoding specification.</p>

 	<div class="req" id="char_non_iana">
	<p class="advisement">Character encodings that are not in the IANA registry SHOULD NOT be used, except by private agreement.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-EncodingIdent">Character encoding identification, C022</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

 	<div class="req" id="char_x">
	<p class="advisement">If an unregistered character encoding is used, the convention of using 'x-' at the beginning of the name MUST be followed.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-EncodingIdent">Character encoding identification, C023</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

 	<div class="req" id="char_not_unique">
	<p class="advisement">If the unique encoding approach is not chosen, specifications MUST designate at least one of the UTF-8 and UTF-16 encoding forms of Unicode as admissible character encodings and SHOULD choose at least one of UTF-8 or UTF-16 as required encoding forms (encoding forms that MUST be supported by implementations of the specification).</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-EncodingIdent">Character encoding identification, C026</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

<div class="req" id="char_default">
	<p class="advisement">Specifications that require a default encoding MUST define either UTF-8 or UTF-16 as the default, or both if they define suitable means of distinguishing them.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-EncodingIdent">Character encoding identification, C027</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
</div>
  </section>


<section id="char_identifying" class="subtopic">
<h3>Identifying character encodings</h3>


<aside class="links" id="links_char_identifying">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Encodings">Choice and Identification of Character Encodings</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-doc-charset">Document character set</a> — what it is, and how it relates to the encodings used for a document.</p></li>
</ul>
</aside>


 	<div class="req" id="char_heuristics">
	<p class="advisement">Specifications MUST NOT propose the use of heuristics to determine the encoding of data.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-EncodingIdent">Character encoding identification, C028</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

 	<div class="req" id="char_conflict">
	<p class="advisement">Specifications MUST define conflict-resolution mechanisms (e.g. priorities) for cases where there is multiple or conflicting information about character encoding.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-EncodingIdent">Character encoding identification, C028</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>
  </section>


<section id="char_escapes" class="subtopic">
<h3>Designing character escapes</h3>


<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Achar_escapes" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_char_escapes">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Escaping">Character Escaping</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
</ul>
</aside>


 	<div class="req" id="char_escaping">
	<p class="advisement">Specifications SHOULD provide a mechanism for escaping characters, particularly those which are invisible or ambiguous.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a>, W3C article.</p>
	</details>
	</div>

    <p>It is generally recommended that character escapes be provided so that sequences that are difficult to enter or edit can be introduced using a plain text editor. Escape sequences are particularly useful for invisible or ambiguous Unicode characters, including zero-width spaces, soft hyphens, various bidi controls, Mongolian vowel separators, etc.</p>
<p>For advice on the use of escapes in markup, most of which also applies to other formats, see <a href="https://www.w3.org/International/questions/qa-escapes">Using character escapes in markup and CSS</a>.</p>


 	<div class="req" id="char_esc_new">
	<p class="advisement">Specifications SHOULD NOT invent a new escaping mechanism if an appropriate one already exists.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-Escaping">Character Escaping, C042</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

	
	<div class="example" id="char-escape-examples">
		<p>Here are some examples of common escaping mechanisms found on the Web or in common programming languages. The example character here is <span class="codepoint"><bdi lang="en">&#x1f63d;</bdi><code class="uname" translate="no">U+1F63D KISSING CAT FACE WITH CLOSED EYES</code></span>.</p>
		<table>
			<thead>
				<tr>
					<th style="width:20%">Found In</th>
					<th>Type</th>
					<th style="width:15%">Example</th>
					<th>Description</th>
				</tr>
			</thead>
			<tbody>
				<tr>
					<td rowspan="2">HTML, XML</td>
					<td>Hex <abbr title="numeric character references">NCRs</abbr></td>
					<td><code>&amp;#x1F63D;</code></td>
					<td>Hexadecimal encoding of the Unicode code point</td>
				</tr>
				<tr>
					<td>Decimal <abbr title="numeric character references">NCRs</abbr></td>
					<td><code>&amp;#128573;</code></td>
					<td>Decimal encoding of the Unicode code point</td>
				</tr>
				<tr>
					<td>JavaScript, Ruby, Rust, [[UTS18]]</td>
					<td><code>\u</code> delimited</td>
					<td><code>\u{1F63D}</code></td>
					<td>Hexadecimal encoding of the Unicode code point</td>
				</tr>
				<tr>
					<td>Perl</td>
					<td><code>\x</code> delimited</td>
					<td><code>\x{1F63D}</code></td>
					<td>Hexadecimal encoding of the Unicode code point; uses <code>x</code> instead of the more common <code>u</code></td>
				</tr>
				<tr>
					<td>Java, JavaScript, JSON, C, C++, Python</td>
    				<td><code>\u</code> UTF-16 code units</td>
					<td><code>\uD83D\uDE3D</code></td>
					<td>Fixed-width hexadecimal encoding of UTF-16 code units; <a>supplementary characters</a> are encoded as a surrogate pair</td>
				</tr>
				<tr>
					<td>C, C++, Python</td>
					<td><code>\U</code> UTF-32 code units</td>
					<td><code>\U0001f63d</code></td>
					<td>Fixed-width hexadecimal encoding of UTF-32 code units; most often used together with <code>\u</code> escapes (which are more efficient for the more-common <a>BMP</a> characters).<br>For example, <code>\u00c0 \U0001f63d \u12fe</code></td>
				</tr>
				<tr>
					<td>URLs</td>
					<td>URL Encode</td>
					<td><code>%F0%9F%98%BD</code></td>
					<td>Hexadecimal encoding of UTF-8 bytes; each byte requires three characters; each code point requires from 1 to 4 bytes</td>
				</tr>
			</tbody>
		</table>
		

		<p>When choosing an escaping mechanism, note that hexadecimal is generally preferred to decimal encodings, due to the common use of hexadecimal in the Unicode Standard and its references.</p>
	</div>
 	<div class="req" id="char_esc_alternates">
	<p class="advisement">The number of different ways to escape a character SHOULD be minimized (ideally to one).</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-Escaping">Character Escaping, C043</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

 	<div class="req" id="char_esc_end">
	<p class="advisement">Escape syntax SHOULD require either explicit end delimiters or a fixed number of characters in each character escape. Escape syntaxes where the end is determined by any character outside the set of characters admissible in the character escape itself SHOULD be avoided.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-Escaping">Character Escaping, C044</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

 	<div class="req" id="char_esc_hex">
	<p class="advisement">Whenever specifications define character escapes that allow the representation of characters using a number, the number MUST represent the Unicode code point of the character and SHOULD be in hexadecimal notation.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-Escaping">Character Escaping, C045</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

 	<div class="req" id="char_esc_acceptable">
	<p class="advisement">Escaped characters SHOULD be acceptable wherever their unescaped forms are; this does not preclude that syntax-significant characters, when escaped, lose their significance in the syntax. In particular, if a character is acceptable in identifiers and comments, then its escaped form should also be acceptable.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-Escaping">Character Escaping, C046</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>
  </section>


<section id="char_storing" class="subtopic">
<h3>Storing text</h3>


<aside class="links" id="links_char_storing">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-LogicalOrder">Visual rendering and logical order</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
</ul>
</aside>


 	<div class="req" id="char_storing_logical">
	<p class="advisement">Protocols, data formats and APIs MUST store, interchange or process text data in logical order.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-LogicalOrder">Visual Rendering and Logical Order, C003</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>

 	<div class="req" id="char_logical_storage">
	<p class="advisement">Independent of whether some implementation uses logical selection or visual selection, characters selected MUST be kept in logical order in storage.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-LogicalOrder">Visual Rendering and Logical Order, C075</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>

 	<div class="req" id="char_storing_discontiguous">
	<p class="advisement">Specifications of protocols and APIs that involve selection of ranges SHOULD provide for discontiguous logical selections, at least to the extent necessary to support implementation of visual selection on screen on top of those protocols and APIs.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-LogicalOrder">Visual Rendering and Logical Order, C004</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>
</section>


<section id="char_string" class="subtopic">
<h3>Defining 'string'</h3>


<aside class="links" id="links_char_string">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-Strings">String concepts</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
</ul>
</aside>

<div class="xref"><span class="seealso">See also</span>
<p>[[[#char_indexing]]].</p>
<p>[[[#char_def]]].</p>
</div>

<aside class="note">
    <p>The best practices found in this section are intended to be mutually consistent with those in [[DESIGN-PRINCIPLES]]. The definitions in this section use terms found in the <cite>Internationalization Glossary</cite> [[I18N-GLOSSARY]]. Some of these definitions are themselves taken from [[WEBIDL]], [[INFRA]], or the Unicode glossary, in which case the definitions are quoted verbatim and include links to their source. Please refer to instructions in the Internationalization Glossary for how to import and link definitions in your own specification.</p>
</aside>

<div class="issue">
	<p>Notwithstanding the note just above, I18N's best practices appear to be exactly opposite those in [[DESIGN-PRINCIPLES]] at the moment. The details turn out to be the same, but we need to resolve differences in guidance and wording. The issue <a href="https://github.com/w3ctag/design-principles/issues/454">design-principles#454</a> tracks this.</p>
</div>

<div class="req" id="char_string_default">
	<p class="advisement">Unless you have a reason not to, use a string definition consistent with {{USVString}}.</p>
</div>

<div class="req" id="char_string_dom">
	<p class="advisement">Use a string definition consistent with {{DOMString}} if your specification does not process the internal value of strings and is not required to check for unpaired surrogate code points, or if your specification pertains to the [[DOM]], defines a JavaScript API or data format, or defines strings as opaque values that are not processed.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	   <p><a href="https://infra.spec.whatwg.org/#scalar-value-string">Scalar value string</a> definition in [[INFRA]]</p>
	   <p><a href="https://www.w3.org/TR/design-principles/#idl-string-types">IDL String Types</a> in <cite>Web Platform Design Principles</cite> [[DESIGN-PRINCIPLES]]</p>
	   <p><a href="https://www.w3.org/TR/charmod/#sec-Strings">String concepts, C012</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
</div>

<p>A string is a sequence of characters. Because [[UNICODE]] is fundamental to understanding and working with text, including text that uses <a>legacy character encodings</a>, the basic definition of a string depends on Unicode and its concept of an encoded character. Specifically:</p>

<p class="localdef">A <dfn class="lint-ignore">string</dfn> is a well-formed sequence of zero or more <a>Unicode Scalar Values</a>.</p>

<p>Because there are multiple ways of working with strings, different terminology has evolved to support the needs of different specifications. Be sure to understand your specification's needs and use the most appropriate and precise terminology. On the Web, there are three types of strings:</p>

<ul>
	<li>{{USVString}}. Strings based on Unicode <a>code points</a>, also known as <a>Unicode Scalar Values</a></li>
	<li>{{DOMString}}. Strings based on <a href="https://www.w3.org/TR/i18n-glossary/#dfn-utf-16">UTF-16</a> <a>code units</a></li>
	<li>{{ByteString}}. Strings based on bytes in some <a>character encoding form</a> (preferably <a>UTF-8</a>)</li>
</ul>

<p>One difference between these different string types is how <a>surrogate</a> <a>code points</a> are handled. Note the difference between a <a>code point</a> (which represents a <a>Unicode Scalar Value</a>, i.e. a character) and a <a>code unit</a> (a unit of encoding in a <a>character encoding form</a>).</p>

<p>The <a href="https://www.w3.org/TR/i18n-glossary/#dfn-utf-16">UTF-16</a> <a>character encoding form</a> uses 16-bit <a>code units</a>. Characters whose <a>scalar values</a> require more than 16 bits are encoded using a pair of <a>surrogate</a> <a>code units</a>: a "high surrogate" (in the range <code class="uname">U+D800-U+DBFF</code>) followed by a "low surrogate" (in the range <code class="uname">U+DC00-U+DFFF</code>). Unicode reserves the <a>code points</a> in these ranges as non-characters so that there is no confusion between the <a>code units</a> in <a href="https://www.w3.org/TR/i18n-glossary/#dfn-utf-16">UTF-16</a> and normal text.</p>

<p>In a {{USVString}}, isolated <a>surrogate</a> code points are invalid and implementations are required to replace any found in a string with the Unicode replacement character (<span class="codepoint" translate="no"><bdi lang="und">&#xFFFD;</bdi><code class="uname">U+FFFD REPLACEMENT CHARACTER</code></span>). For strings whose most common algorithms operate on scalar values (such as percent-encoding), or for operations which can’t handle surrogates in input (such as APIs that pass strings through to native platform APIs), {{USVString}} should be used. Any of these references are equivalent to this:</p>
	<ul>
		<li>{{USVString}} [[WEBIDL]]</li>
		<li><a>scalar value string</a> [[INFRA]]</li>
		<li><a target="_blank" href="https://www.w3.org/TR/xmlschema11-2/#string">xsd:string</a> [[XMLSCHEMA11-2]]</li>
	</ul>

<p>In a {{DOMString}}, unpaired <a>surrogate</a> <a>code units</a> can appear in a string. Most string operations don’t need to interpret the <a>code units</a> inside of strings. Specifying {{DOMString}} means that implementations are not required to validate the contents of the string, making this the ideal string type for most data structures, formats, or APIs. The [[DOM]] and JavaScript strings use {{DOMString}} as their string type and the [[INFRA]] standard defines the term 'string' to mean a {{DOMString}}:</p>

<p class="localdef">A string is a sequence of unsigned 16-bit integers, also known as <a>code units</a>.</p>

<p class="note">[[INFRA]]'s use of the term <a>code unit</a> refers specifically to the <a href="https://www.w3.org/TR/i18n-glossary/#dfn-utf-16">UTF-16</a> character encoding's code units, rather than the more general definition of a <a>code unit</a> that can refer to different size values, such as bytes, in any <a>character encoding form</a>.</p>

<p>A {{ByteString}} depends on the <a>character encoding form</a> used to encode characters into bytes. <a>Legacy character encodings</a> do not have a concept of "surrogates", so there is generally no way to encode a surrogate code point. Valid <a>UTF-8</a> does not permit surrogate code points: these are replaced by <span class="codepoint" translate="no"><bdi lang="und">&#xFFFD;</bdi><code class="uname">U+FFFD REPLACEMENT CHARACTER</code></span> when encoding or decoding text in <a>UTF-8</a>. When converting <a href="https://www.w3.org/TR/i18n-glossary/#dfn-utf-16">UTF-16</a> to <a>UTF-8</a>, any <a>surrogate pairs</a> are transformed into the proper UTF-8 byte sequence encoding the specific <a>scalar value</a>.</p>

<div class="req" id="char_string_no_legacy">
	<p class="advisement">Specifications SHOULD NOT add or define support for <a>legacy character encodings</a> unless there is a specific reason to do so.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p>See also <a href="#char_choosing"></a>.</p>
	</details>
</div>

<div class="req" id="char_string_byte">
	<p class="advisement">Specifications SHOULD NOT define a string as a {{ByteString}} or as a sequence of bytes ('byte string'). For binary data or sequences of bytes, use {{Uint8Array}} instead.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	   <p><a href="https://www.w3.org/TR/charmod/#sec-Strings">String concepts, C011</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	   <p><a href="https://www.w3.org/TR/string-meta/#protocol-strings">Strings that are part of a legacy protocol or format</a>, in <cite>Strings on the Web: Language and Direction Metadata</cite> [[STRING-META]]</p>
	   <p><a href="https://www.w3.org/TR/design-principles/#idl-string-types">IDL String Types</a> in <cite>Web Platform Design Principles</cite> [[DESIGN-PRINCIPLES]]</p>
	</details>
</div>

<p>The type {{ByteString}} defines strings as sequences of bytes (octets). Interpretation of byte strings thus requires the specification of a <a>character encoding form</a>. UTF-8 is the preferred encoding for wire and document formats [[ENCODING]], but there is generally no reason to specify strings in terms of the underlying byte values.</p>

<aside class="note">
	<p>Specifications for document formats or protocols often deal with the specific byte values used for various fields or values or with the <a>character encoding</a> used for serializing the data. It is therefore tempting to specify a text field ("string") as a {{ByteString}} which uses the <a>UTF-8</a> <a>character encoding form</a>.</p>
	
	<p>It is preferable, however, to specify these fields as a {{DOMString}} (or, rarely, a {{USVString}}), since the data encoded into these fields must be serialized from and deserialized into in-memory string representations, such as the [[DOM]] or JavaScript strings or your platform's native Unicode string type.</p>
</aside>

<p>See <a href="#char_choosing"></a> for additional best practices.</p>

</section>

<section id="char_whitespace" class="subtopic">
	<h3>Whitespace characters</h3>

	<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Achar_whitespace" target="_blank">See related review comments.</a></p>
	
	<div class="xref"><span class="seealso">See also</span>
	<p>[[[#markup_identifiers]]].</p>
	</div>

	<p>Whitespace characters are characters that represent horizontal or vertical space in typography. Whitespace characters can have different visual effects: some whitespace characters have no visible effect, while others represent larger, smaller, or variable amounts of space on the page.</p>

	<div class="req" id="char_define_whitespace">
		<p class="advisement">Specifications that use the term "whitespace" SHOULD explicitly define what the term means.</p>
		<details class="links">
			<summary>explanations &amp; examples</summary>
			<ul>
				<li><a href="https://infra.spec.whatwg.org/#ascii-whitespace">ASCII whitespace</a>, in <cite>Infra Living Standard</cite></li>
				<li><a href="https://www.w3.org/TR/css-syntax/#whitespace">Whitespace</a>, in <cite>CSS Syntax</cite></li>
				<li><a href="https://www.w3.org/TR/css-text/#white-space">Document white space characters</a>, in <cite>CSS Text</cite></li>
				<li><a href="https://tc39.es/ecma262/#sec-white-space">White Space</a>, in <cite>ECMAScript Language Specification</cite></li>
				<li><a href="https://www.w3.org/TR/WGSL/#blankspace-and-line-breaks">Blankspace and line breaks</a>, in <cite>WebGPU Shading Language</cite></li>
				<li><a href="https://www.w3.org/TR/webdriver/#dfn-whitespace">Whitespace</a>, in <cite>WebDriver</cite></li>
				<li><a href="https://www.w3.org/TR/xml/#NT-S">S (white space)</a>, in <cite>Extensible Markup Language (XML) 1.0</cite></li>
			</ul>
		</details>
	</div>

	<div class="req" id="char_unicode_white_space">
		<p class="advisement">Most specifications SHOULD define whitespace to mean characters with the Unicode <a href="https://www.unicode.org/reports/tr44/#White_Space">White_Space</a> property.</p>
	</div>

	<div class="req" id="char_ascii_whitespace">
		<p class="advisement">Specifications that define whitespace for use in <a href="https://www.w3.org/TR/charmod-norm/#def_vocabulary">vocabularies</a> that are restricted to ASCII or to formats that are whitespace delimited (examples include HTML or CSS) SHOULD specify <a href="https://infra.spec.whatwg.org/#ascii-whitespace">ASCII whitespace</a> as part of their grammar.</p>
	</div>

	<div class="req" id="char_whitespace_code_points">
		<p class="advisement">If a specification defines "whitespace" differently from ASCII or Unicode whitespace, the specific code points MUST be listed.</p>
	</div>

	<p>
		Some specifications, such as <a href="https://tc39.es/ecma262/#sec-white-space">ECMAScript</a>, have provided their own definitions of whitespace, which differ from the above to meet their own specific requirements.
	</p>

	<p>
		The following table shows which characters are defined as whitespace in various specifications.
	</p>

		<table class="whitespace">
			<caption>Links to the latest definitions of the information in the table can be found by expanding the "explanations &amp; examples".</caption>
			<thead>
				<tr>
					<th>&nbsp;</th>
					<th><code class="kw" translate="no">white_space</code> property</th>
					<th><code class="kw" translate="no">pattern_white_space</code> property</th>
					<th><a href="https://infra.spec.whatwg.org/#ascii-whitespace">ASCII whitespace</a> (HTML)</th>
					<th>CSS whitespace</th>
					<th>ECMAScript</th>
					<th>XML</th>
				</tr>
			</thead>
			<tbody>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="HTAB" src="images/0009.png"><code class="uname" translate="no">U+0009 (horizontal tab)</code></span></td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="LF" src="images/000A.png"><code class="uname" translate="no">U+000A (line feed)</code></span></td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>✓</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="VTAB" src="images/000B.png"><code class="uname" translate="no">U+000B (vertical tab)</code></span></td>
					<td>✓</td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="FF" src="images/000C.png"><code class="uname" translate="no">U+000C (form feed)</code></span></td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="CR" src="images/000D.png"><code class="uname" translate="no">U+000D (carriage return)</code></span></td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="SP" src="images/0020.png"><code class="uname" translate="no">U+0020 SPACE</code></span></td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
					<td>✓</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="NEL" src="images/0085.png"><code class="uname" translate="no">U+0085 (next line)</code></span></td>
					<td>✓</td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="NBSP" src="images/00A0.png"><code class="uname" translate="no">U+00A0 NO-BREAK SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="Ogham space" src="images/1680.png"><code class="uname" translate="no">U+1680 OGHAM SPACE MARK</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="NQSP" src="images/2000.png"><code class="uname" translate="no">U+2000 EN QUAD</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="MQSP" src="images/2001.png"><code class="uname" translate="no">U+2001 EM QUAD</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="ENSP" src="images/2002.png"><code class="uname" translate="no">U+2002 EN SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="EMSP" src="images/2003.png"><code class="uname" translate="no">U+2003 EM SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="3/M SP" src="images/2004.png"><code class="uname" translate="no">U+2004 THREE-PER-EM SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="4/M SP" src="images/2005.png"><code class="uname" translate="no">U+2005 FOUR-PER-EM SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="6/M SP" src="images/2006.png"><code class="uname" translate="no">U+2006 SIX-PER-EM SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="FSP" src="images/2007.png"><code class="uname" translate="no">U+2007 FIGURE SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="PSP" src="images/2008.png"><code class="uname" translate="no">U+2008 PUNCTUATION SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="THSP" src="images/2009.png"><code class="uname" translate="no">U+2009 THIN SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="HSP" src="images/200A.png"><code class="uname" translate="no">U+200A HAIR SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="LRM" src="images/200E.png"><code class="uname" translate="no">U+200E LEFT-TO-RIGHT MARK</code></span></td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="RLM" src="images/200F.png"><code class="uname" translate="no">U+200F RIGHT-TO-LEFT MARK</code></span></td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="LSEP" src="images/2028.png"><code class="uname" translate="no">U+2028 LINE SEPARATOR</code></span></td>
					<td>✓</td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="PSEP" src="images/2029.png"><code class="uname" translate="no">U+2029 PARAGRAPH SEPARATOR</code></span></td>
					<td>✓</td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="NNBSP" src="images/202F.png"><code class="uname" translate="no">U+202F NARROW NO-BREAK SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="MMSP" src="images/205F.png"><code class="uname" translate="no">U+205F MEDIUM MATHEMATICAL SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="IDSP" src="images/3000.png"><code class="uname" translate="no">U+3000 IDEOGRAPHIC SPACE</code></span></td>
					<td>✓</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
				<tr>
					<td><span class="codepoint" translate="no"><img alt="ZWNBSP" src="images/FEFF.png"><code class="uname" translate="no">U+FEFF ZERO WIDTH NO-BREAK SPACE</code></span></td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>&nbsp;</td>
					<td>✓</td>
					<td>&nbsp;</td>
				</tr>
			</tbody>
		</table>

		<p>Some specifications use the same definition as one of the columns above and are not listed in the table. For example, <a href="https://www.w3.org/TR/webdriver/#dfn-whitespace">WebDriver</a> uses the <code class="kw" translate="no">white_space</code> property and <a href="https://www.w3.org/TR/WGSL/#blankspace-and-line-breaks">WebGPU Shading Language</a> uses the <code class="kw" translate="no">pattern_white_space</code> property.</p>
  </section>


<section id="char_ref" class="subtopic">
<h3>Referring to Unicode characters</h3>


<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Achar_ref" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_char_ref">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/i18n-activity/guidelines/editing#codepoints">I18N Editing Guidelines</a> &mdash; more information on character formatting</p></li>
</ul>
</aside>


 	<div class="req" id="char_ref_Uchar">
	<p class="advisement">Use <code translate="no">U+XXXX</code> syntax to represent Unicode code points in a specification.</p>
	</div>

<p>The <code translate="no">U+XXXX</code> format is well understood when referring to Unicode code points in a specification. These are space separated when appearing in a sequence. No additional decoration is needed. Note that a code point may contain four, five, or six hexadecimal digits. When fewer than four digits are needed, the code point number is zero filled.</p>

<aside class="example" title="Code point examples">
	<table>
		<thead>
			<tr>
				<th>Character</th>
				<th>Unicode Name</th>
				<th>U+XXXX syntax</th>
			</tr>
		</thead>
		<tbody>
			<tr>
				<td style="text-align:center">e</td>
				<td class="uname">LATIN SMALL LETTER E</td>
				<td class="uname" style="text-align:right">U+0065</td>
			</tr>
			<tr>
				<td style="text-align:center">&#xe9;</td>
				<td class="uname">LATIN SMALL LETTER E WITH ACUTE</td>
				<td class="uname" style="text-align:right">U+00E9</td>
			</tr>
			<tr>
				<td style="text-align:center">&#xa8a;</td>
				<td class="uname">GUJARATI LETTER UU</td>
				<td class="uname" style="text-align:right">U+0A8A</td>
			</tr>
			<tr>
				<td style="text-align:center">&#xfffd;</td>
				<td class="uname">REPLACEMENT CHARACTER</td>
				<td class="uname" style="text-align:right">U+FFFD</td>
			</tr>
			<tr>
				<td style="text-align:center">&#x1f62e;</td>
				<td class="uname">FACE WITH OPEN MOUTH</td>
				<td class="uname" style="text-align:right">U+1F62E</td>
			</tr>
		</tbody>
	</table>
</aside>

<div class="req" id="char_ref_unicode_names">
	<p class="advisement">Use the Unicode character name to describe specific code points.</p>
</div>

<p>Unicode assigns unique, immutable names to each assigned Unicode code point. Using these names in your specification when referring to specific characters (along with the code point in <code class="uname" translate="no">U+XXXX</code> notation) will help make your specification unambiguous.</p>

<div class="req" id="char_ref_unicode_template">
	<p class="advisement">Use of the character naming template is RECOMMENDED.</p>
</div>

<p>For most characters, the template looks like this:</p>

<code>
&lt;span class="codepoint" translate="no">&lt;bdi lang="??">&amp;#xXXXX;&lt;/bdi>&lt;code class="uname">U+XXXX UNICODE_CHARACTER_NAME_ALL_IN_CAPS&lt;/code>&lt;/span>
</code>

<aside class="example" title="Example of a character reference">
<p>Filling in the above template like this:</p>

<code>
&lt;span class="codepoint" translate="no">&lt;bdi lang="fr">&amp;#x00E9;&lt;/bdi>&lt;code class="uname">U+00E9 LATIN SMALL LETTER E WITH ACUTE&lt;/code>&lt;/span>
</code>

<p>Produces output in the page like this: <span class="codepoint" translate="no"><bdi lang="fr">é</bdi><code class="uname">U+00E9 LATIN SMALL LETTER E WITH ACUTE</code></span>.</p>
</aside>

<p>The <code translate="no" class="kw">bdi</code> element is used to ensure that example characters that are right-to-left do not interfere with the layout of the page. Do not include line breaks or a space between the closing <code translate="no" class="kw">bdi</code> and the following <code translate="no" class="kw">code</code> element; spacing and presentation are controlled by styling.</p>

<p>The <code translate="no" class="kw">lang</code> attribute should be filled in appropriately to get the correct font selection for a given context. Examples in East Asian languages (such as Chinese, Japanese, or Korean) or in the Arabic script can sometimes require greater care in choosing a language tag. Rarely, for certain languages, it might be necessary to adjust the style of the <code>bdi</code> element with a <kbd translate="no">font-family</kbd> and/or <kbd translate="no">font-size</kbd> in your own stylesheet.</p>

<p>For invisible characters (such as control characters), combining characters, or for whitespace, use an image instead of the character; or you may also omit the character and its surrounding <code translate="no" class="kw">bdi</code> element.</p>

<code>
&lt;span class="codepoint" translate="no">&lt;img alt="..." src="...">&lt;code class="uname">U+XXXX UNICODE_CHARACTER_NAME_ALL_IN_CAPS&lt;/code>&lt;/span>
</code>

<aside class="example" title="Example of a character reference with an image">
<p>Filling in the above template like this:</p>

<code>
&lt;span class="codepoint" translate="no">&lt;img alt="NBSP" src="images/00A0.png">&lt;code class="uname">U+00A0 NO-BREAK SPACE&lt;/code>&lt;/span>
</code>

<p>Produces output in the page like this: <span class="codepoint" translate="no"><img alt="NBSP" src="images/00A0.png"><code class="uname">U+00A0 NO-BREAK SPACE</code></span>.</p>
</aside>

<p>Short sequences of characters should list the character names, separated by <span class="codepoint" translate="no">+</span>.</p>

<aside class="example" title="Example of a code point sequence">
	<p>This example: <span class="codepoint" translate="no"><bdi lang="hi">&#x0928;&#x093f;</bdi><code class="uname">U+0928 DEVANAGARI LETTER NA</code> + <code class="uname">U+093F DEVANAGARI VOWEL SIGN I</code></span> uses the following markup:</p>

<code>
&lt;span class="codepoint" translate="no">&lt;bdi lang="hi">&amp;#x0928;&amp;#x093f;&lt;/bdi>&lt;code class="uname">U+0928 DEVANAGARI LETTER NA&lt;/code> + &lt;code class="uname">U+093F DEVANAGARI VOWEL SIGN I&lt;/code>&lt;/span>
</code>	

</aside>

<p>There are cases where including the character name and additional markup is overly pedantic and detracts from usability, but be cautious about being so informal as to impair meaning. In particular, long sequences will sometimes just list the code points, although the character names should be retained where possible for clarity. An example can be found in this document in the <a href="#char_term_def">discussion of the composed "family" emoji</a>: <span class="codepoint"><bdi lang="en">👨‍👩‍👧‍👧</bdi><code class="uname">U+1F468 U+200D U+1F469 U+200D U+1F467 U+200D U+1F467</code></span></p>

</section>
</section>


<section id="char_unicoderef" class="subtopic">
<h3>Referencing the Unicode Standard</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Achar_unicoderef" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_char_unicoderef">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-RefUnicode">Referencing the Unicode Standard and ISO/IEC 10646</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
</ul>
</aside>


 	<div class="req" id="char_unicoderef_do">
	<p class="advisement">Since specifications in general need both a definition for their characters and the semantics associated with these characters, specifications SHOULD include a reference to the Unicode Standard, whether or not they include a reference to ISO/IEC 10646.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefUnicode">Referencing the Unicode Standard and ISO/IEC 10646, C062</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>

 	<div class="req" id="char_unicoderef_generic">
	<p class="advisement">A generic reference to the Unicode Standard MUST be made if it is desired that characters allocated after a specification is published are usable with that specification. A specific reference to the Unicode Standard MAY be included to ensure that functionality depending on a particular version is available and will not change over time.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefUnicode">Referencing the Unicode Standard and ISO/IEC 10646, C063</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>

 	<div class="req" id="char_unicoderef_latest">
	<p class="advisement">All generic references to the Unicode Standard MUST refer to the latest version of the Unicode Standard available at the date of publication of the containing specification.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefUnicode">Referencing the Unicode Standard and ISO/IEC 10646, C064</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>

 	<div class="req" id="char_unicoderef_10646">
	<p class="advisement">All generic references to ISO/IEC 10646 MUST refer to the latest version of ISO/IEC 10646 available at the date of publication of the containing specification.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-RefUnicode">Referencing the Unicode Standard and ISO/IEC 10646, C065</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p>
	</details>
	</div>
  </section>
  </section>


<section id="operations" class="topic">
<h2>Text-processing</h2>


<div id="operations_checklist" class="summaryC"></div>


<!--p>In this section:</p>
	<ul class="summary">
	<li>[[[#char_indexing]]]</li>
	<li>[[[#string_match]]]</li>
	<li>[[[#text_n11n]]]</li>
	<li>[[[#text_case]]]</li>
	<li>[[[#char_truncation]]]</li>
	<li>[[[#file_naming]]]</li>
	<li>[[[#char_sort]]]</li>
	</ul-->


<section id="char_indexing" class="subtopic">
<h3>Choosing text units for segmentation, indexing, etc.</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Achar_indexing" target="_blank">See related review comments.</a></p>


<div class="xref"><span class="seealso">See also</span>
<p>[[[#char_string]]].</p>
<p>[[[#char_truncation]]].</p>
</div>


<p>There are many situations where a software process needs to access a substring or to point within a string and does so by the use of indices, i.e. numeric &quot;positions&quot; within a string. Where such indices are exchanged between components of the Web, there is a need for an agreed-upon definition of string indexing in order to ensure consistent behavior. The two main questions that arise are: &quot;What is the unit of counting?&quot; and &quot;Do we start counting at 0 or 1?&quot;.</p>


	<div class="req" id="char_index_char">
	<p class="advisement">The <a class="termref" href="https://www.w3.org/TR/charmod/#def-character-string">character string</a> is RECOMMENDED as a basis for string indexing.</p>
	<details class="links"><summary>more</summary>
	<p>Character Model for the World Wide Web: Fundamentals, <a href="https://www.w3.org/TR/charmod/">String indexing</a>, <a href="https://www.w3.org/TR/charmod/#C051">C051</a></p>
	</details>
	</div>
	  
	<div class="req" id="char_index_grapheme">
	<p class="advisement">Grapheme clusters MAY be used as a basis for string indexing in applications where user interaction is the primary concern.</p>
	<details class="links"><summary>more</summary>
	<p>Character Model for the World Wide Web: Fundamentals, <a href="https://www.w3.org/TR/charmod/">String indexing</a>, <a href="https://www.w3.org/TR/charmod/#C071">C071</a></p>
	<p><a href="https://www.w3.org/International/questions/qa-indic-graphemes">Typographic character units in complex scripts</a> Situations where grapheme clusters can be insufficient for segmenting complex scripts.</p>
	<p>Character encodings: Essential concepts, <a href="https://www.w3.org/International/articles/definitions-characters/index.en#characters">Characters &amp; clusters</a></p>
	</details>
	</div>

	<div class="req" id="char_index_grapheme_plus">
	<p class="advisement">Specifications that define indexing in terms of grapheme clusters MUST either: (a) define grapheme clusters in terms of extended grapheme clusters as defined in <a href="https://unicode.org/reports/tr29/">Unicode Standard Annex #29, Unicode Text Segmentation</a> (UAX #29), or (b) define specifically how tailoring is applied to the indexing operation.</p>
	<details class="links"><summary>more</summary>
	<p>Character Model for the World Wide Web: Fundamentals, <a href="https://www.w3.org/TR/charmod/">String indexing</a>, <a href="https://www.w3.org/TR/charmod/#C071">C071</a></p>
	<p>Unicode Standard Annex #29, Unicode Text Segmentation, <a href="https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries">Grapheme Cluster Boundaries</a></p>
	<p><a href="https://www.w3.org/International/questions/qa-indic-graphemes">Typographic character units in complex scripts</a> Situations where grapheme clusters can be insufficient for segmenting complex scripts.</p>
	<p>Character encodings: Essential concepts, <a href="https://www.w3.org/International/articles/definitions-characters/index.en#characters">Characters &amp; clusters</a></p>
	</details>
	</div>


	<div class="req" id="char_index_byte">
	<p class="advisement">The use of <a class="termref" href="https://www.w3.org/TR/charmod/#def-byte-string">byte strings</a> for indexing is NOT RECOMMENDED.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/TR/charmod/#C072">Character Model for the World Wide Web: Fundamentals &gt; String indexing</a></p>
	<p>Character Model for the World Wide Web: Fundamentals, <a href="https://www.w3.org/TR/charmod/">String indexing</a>, <a href="https://www.w3.org/TR/charmod/#C072">C072</a></p>
	</details>
	</div>


  	<div class="req" id="char_index_codeunit">
	<p class="advisement">A UTF-16 <a class="termref" href="https://www.w3.org/TR/charmod/#def-physical-string">code unit string</a> is NOT RECOMMENDED as a basis for string indexing, even if this results in a significant improvement in the efficiency of internal operations when compared to the use of character string.</p>
	<details class="links"><summary>more</summary>
	<p>Character Model for the World Wide Web: Fundamentals, <a href="https://www.w3.org/TR/charmod/">String indexing</a>, <a href="https://www.w3.org/TR/charmod/#C052">C052</a></p>
	</details>
	</div>
	<p>A counter-example is the use of UTF-16 in <a href="https://www.w3.org/TR/REC-DOM-Level-1/">DOM Level 1</a>. The use of UTF-16 code units is discouraged because it leaves open the possibility of an index occurring between the two code units of a surrogate pair, which would cause significant problems (see [[[#char_truncation]]]).</p>


	<div class="req" id="char_index_substrings">
	<p class="advisement">Specifications that need a way to identify substrings or point within a string SHOULD consider ways other than string indexing to perform this operation.</p>
	<details class="links"><summary>more</summary>
	<p>Character Model for the World Wide Web: Fundamentals, <a href="https://www.w3.org/TR/charmod/">String indexing</a>, <a href="https://www.w3.org/TR/charmod/#C053">C053</a></p>
	</details>
	</div>


	<div class="req" id="char_index_counting">
	<p class="advisement">Specifications SHOULD understand and process single characters as substrings, and treat indices as boundary positions <em>between</em> counting units, regardless of the choice of counting units.</p>
	<details class="links"><summary>more</summary>
	<p>Character Model for the World Wide Web: Fundamentals, <a href="https://www.w3.org/TR/charmod/">String indexing</a>, <a href="https://www.w3.org/TR/charmod/#C053">C053</a></p>
	</details>
	</div>


	<div class="req" id="char_index_api">
	<p class="advisement">Specifications of APIs SHOULD NOT specify single characters or single 'units of encoding' as argument or return types.</p>
	<details class="links"><summary>more</summary>
	<p>Character Model for the World Wide Web: Fundamentals, <a href="https://www.w3.org/TR/charmod/">String indexing</a>, <a href="https://www.w3.org/TR/charmod/#C056">C056</a></p>
	</details>
	</div>


	<div class="req" id="char_index_0">
	<p class="advisement">When the positions between the units are counted for string indexing, starting with an index of 0 for the position at the start of the string is the RECOMMENDED solution, with the last index then being equal to the number of counting units in the string.</p>
	<details class="links"><summary>more</summary>
	<p>Character Model for the World Wide Web: Fundamentals, <a href="https://www.w3.org/TR/charmod/">String indexing</a>, <a href="https://www.w3.org/TR/charmod/#C057">C057</a></p>
	</details>
	</div>
</section>


<section id="string_match" class="subtopic">
<h3>Matching string identity for identifiers and syntactic content</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Astring_match" target="_blank">See related review comments.</a></p>


<aside class="links" id="links_string_match">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod-norm/">Character Model for the World Wide Web: String Matching</a>.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod-norm/#identityMatching">String Matching of Syntactic Content in Document Formats and Protocols</a>, in <cite>Character Model for the World Wide Web: String Matching</cite>.</p></li>
</ul>
</aside>
<div class="xref"><span class="seealso">See also</span>
<p>[[[#text_n11n]]].</p>
<p>[[[#text_case]]].</p>
</div>


	<div class="req" id="string_match_steps">
	<p class="advisement">String identity matching for identifiers and syntactic content should involve the following steps: (a) Ensure the strings to be compared constitute a sequence of Unicode code points, (b) Expand all character escapes and includes, (c) Perform any appropriate case-folding and Unicode normalization step, (d) Perform any additional matching tailoring specific to the specification, and (e) Compare the resulting sequences of code points for identity.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#matchingAlgorithm">The Matching Algorithm</a>, in Character Model for the World Wide Web: String Matching</p>
	</details>
	</div>

	<div class="req" id="string_match_default">
	<p class="advisement">The default recommendation for matching strings in identifiers and syntactic content is to do no normalization (i.e., case folding or Unicode Normalization) of content.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#performNorm">Performing the Appropriate Normalization Step</a>, in Character Model for the World Wide Web: String Matching</p>
	</details>
	</div>

	<div class="req" id="string_match_other_normalization">
	<p class="advisement"><a class="termref" href="https://www.w3.org/TR/charmod-norm/#ASCIIFoldNormalizationStep">'ASCII case fold'</a> and <a href="https://www.w3.org/TR/charmod-norm/#CanonicalFoldNormalizationStep" class="termref">'Unicode canonical case fold'</a> approaches should only be used in special circumstances.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#performNorm">Performing the Appropriate Normalization Step</a>, in Character Model for the World Wide Web: String Matching</p>
	</details>
	</div>

	<div class="req" id="string_match_compatibility">
	<p class="advisement">A <a href="https://www.w3.org/TR/charmod-norm/#CompatibilityFoldNormalizationStep" class="termref">'Unicode compatibility case fold'</a> approach should not be used.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#performNorm">Performing the Appropriate Normalization Step</a>, in Character Model for the World Wide Web: String Matching</p>
	</details>
	</div>

	<div class="req" id="string_match_boundaries">
	<p class="advisement">Specifications of vocabularies MUST define the boundaries between syntactic content and character data as well as entity boundaries (if the language has any include mechanism).</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#normalizationChoice">Additional Considerations for Normalization</a>, in Character Model for the World Wide Web: String Matching</p>
	</details>
	</div>
</section>


<section id="text_n11n" class="subtopic">
<h3>Working with Unicode Normalization</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Atext_n11n" target="_blank">See related review comments.</a></p>


<aside class="links" id="links_text_n11n">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3">Character Model for the World Wide Web: String Matching
	<ul>
	<li><p class="link"><a href="https://www.w3.org/TR/charmod-norm/#unicodeNormalization">Unicode Normalization</a>.</p></li>
	<li><p class="link"><a href="https://www.w3.org/TR/charmod-norm/#normalizationLimitations">Identical-Appearing Characters and the Limitations of Normalization</a>.</p></li>
	<li><p class="link"><a href="https://www.w3.org/TR/charmod-norm/#normalizationAndCasefold">Interaction of Normalization and Case Folding</a>.</p></li>
	<li><p class="link"><a href="https://www.w3.org/TR/charmod-norm/#performNorm">Performing the Appropriate Normalization Step</a>.</p></li>
	<li><p class="link"><a href="https://www.w3.org/TR/charmod-norm/#normalizationChoice">Additional Considerations for Normalization</a>.</p></li>
	</ul>
</li>
</ul>
</aside>


	<div class="req" id="text_n11n_default">
	<p class="advisement">Specifications SHOULD NOT specify a Unicode normalization form for encoding, storage, or interchange of a given vocabulary.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#normalizationChoice">Additional Considerations for Normalization</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_n11n_no_change">
	<p class="advisement">Implementations MUST NOT alter the normalization form of textual data being exchanged, read, parsed, or processed except when required to do so as a side-effect of text transformation such as transcoding the content to a Unicode character encoding, case folding, or other user-initiated change, as consumers or the content itself might depend on the de-normalized representation.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#normalizationChoice">Additional Considerations for Normalization</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_n11n_compatibility">
	<p class="advisement">Specifications SHOULD NOT specify compatibility normalization forms (NFKC, NFKD).</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#normalizationChoice">Additional Considerations for Normalization</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_n11n_security">
	<p class="advisement">Specifications MUST document or provide a health-warning if canonically equivalent but disjoint Unicode character sequences represent a security issue.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#normalizationChoice">Additional Considerations for Normalization</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_n11n_operations">
	<p class="advisement">Where operations can produce denormalized output from normalized text input, specifications MUST define whether the resulting output is required to be normalized or not. Specifications MAY state that performing normalization is optional for some operations; in this case the default SHOULD be that normalization is performed, and an explicit option SHOULD be used to switch normalization off.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#normalizing-spec">Requirements When Specifying Normalization in Document Formats</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_n11n_implementation">
	<p class="advisement">Specifications that require normalization MUST NOT make the implementation of normalization optional.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#normalizing-spec">Requirements When Specifying Normalization in Document Formats</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_n11n_sensitive_operations">
	<p class="advisement">Normalization-sensitive operations MUST NOT be performed unless the implementation has first either confirmed through inspection that the text is in normalized form or it has re-normalized the text itself. Private agreements MAY be created within private systems which are not subject to these rules, but any externally observable results MUST be the same as if the rules had been obeyed.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#normalizing-spec">Requirements When Specifying Normalization in Document Formats</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_n11n_mechanism">
	<p class="advisement">A normalizing text-processing component which modifies text and performs normalization-sensitive operations MUST behave as if normalization took place after each modification, so that any subsequent normalization-sensitive operations always behave as if they were dealing with normalized text.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#normalizing-spec">Requirements When Specifying Normalization in Document Formats</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>


<section id="spec_n11n">
<h4>Specifying Unicode Normalization</h4>
	
	<div class="req" id="text_n11n_specification">
	<p class="advisement">Specifications that perform comparison or matching of string values SHOULD specify the appropriate note or warning regarding Unicode normalization.</p>
	</div>
    
    <p>The use or adoption of Unicode Normalization in a specification is usually part of defining how <a href="#string_match_steps">matching</a> takes place in a given format or protocol. To help specification authors and implementers understand some of the complexity involved, the Internationalization Working Group has developed a document describing the considerations for the matching and comparison of strings: <cite class="link"><a href="https://www.w3.org/TR/charmod-norm/">Character Model for the World Wide Web: String Matching</a></cite> [[CHARMOD-NORM]].</p>
    
    <p>One of the choices specifications need to make is whether (or not) to require Unicode Normalization as part of matching various "values" defined as part of the specification's vocabulary. Values are commonly part of a document format or protocol's syntax, and include such things as: attribute names or values, element names or values, IDs, and so forth. Specifications that follow the <a href="#text_n11n_default">recommendation</a> to <em>not</em> employ normalization as part of matching should include the following Note as a reminder to content authors.</p>
        
    <div class="example" id="n11n_note_figure">
	   <p>Example note. Necessarily this version is non-specific about what constitutes "values": specifications may wish to be more specific.</p>
	   <div class="example_div">
           <p class="note example_note">This specification does not permit Unicode normalization of values for the purposes of comparison. Values that are visually and semantically identical but use different Unicode character sequences will not match. Content authors are advised to use the same encoding sequence consistently or to avoid potentially troublesome characters when choosing values. For more information, see [[CHARMOD-NORM]].</p>
       </div>
    </div>
    
    <p>Specifications that choose to require normalization as part of string matching should include the following warning:</p>
    
    <div class="example" id="n11n_warning_figure">
     <p>Example warning. Necessarily this version is non-specific about what constitutes "values": specifications may wish to be more specific.</p>
     
       <div class="example_div">
         <p class="warning">This specification applies Unicode normalization during the matching of values. This can have an effect on the appearance and meaning of the affected text. For more information, see [[CHARMOD-NORM]].</p> 
       </div>
    </div>
    
    <p>Contact the I18N WG for alternatives or assistance if the above do not meet your needs or you're not sure about usage.</p>
    
  </section>
</section>


<section id="text_case" class="subtopic">
<h3>Case folding</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Atext_case" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_text_case">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3">Character Model for the World Wide Web: String Matching
	<ul>
	<li><p class="link"><a href="https://www.w3.org/TR/charmod-norm/#definitionCaseFolding">Case Mapping and Case Folding</a>.</p></li>
	<li><p class="link"><a href="https://www.w3.org/TR/charmod-norm/#handlingCaseFolding">Additional Considerations for Case Folding</a>.</p></li>
	</ul>
</li>
</ul>
</aside>


	<div class="req" id="text_case_options">
	<p class="advisement">Specifications and implementations that define string matching as part of the definition of a format, protocol, or formal language (which might include operations such as parsing, matching, tokenizing, etc.) MUST define the criteria and matching forms used. These MUST be one of: (a) case-sensitive (b) Unicode case-insensitive using Unicode full case-folding (c) ASCII case-insensitive.</p>
	</div>

	<div class="req" id="text_case_sensitive">
	<p class="advisement">Case-sensitive matching is RECOMMENDED for matching syntactic content, including user-defined values.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#sec_case_sensitive">Case-sensitive matching</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_case_unicode_full">
	<p class="advisement">Specifications that define case-insensitive matching in vocabularies that include more than the Basic Latin (ASCII) range of Unicode MUST specify Unicode full casefold matching.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#sec_unicode_cs">Unicode case-insensitive matching</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_case_asciici">
	<p class="advisement">Specifications that define case-insensitive matching in vocabularies limited to the Basic Latin (ASCII) subset of Unicode MAY specify ASCII case-insensitive matching.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#sec_ascii_cs">ASCII case-insensitive matching</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_case_nonascii">
	<p class="advisement">If language-sensitive case-insensitive matching is specified, Unicode case mappings SHOULD be tailored according to language and the source of the language used for each tailoring MUST be specified.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#sec_language_tailoring">Language-specific tailoring</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>

	<div class="req" id="text_case_vocabularies">
	<p class="advisement">Specifications that define case-insensitive matching in vocabularies SHOULD NOT specify language-sensitive case-insensitive matching.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod-norm/#sec_language_tailoring">Language-specific tailoring</a>, in Character Model for the World Wide Web: String Matching.</p>
	</details>
	</div>
</section>


<section id="char_truncation" class="subtopic">
<h3>Truncating or limiting the length of strings</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Achar_truncation" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_char_truncation">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-CollationUnits">Units of collation</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
</ul>
</aside>


<p>Some specifications, formats, or protocols or their implementations need to specify limits for the size of a given data structure or text field. This could be due to many reasons, such as limits on processing, memory, data structure size, and so forth. When selecting or specifying limits on the length of a given string, specifications or implementations need to ensure that they do not cause corruption in the text.</p>

	<div class="req" id="char_trunc_no_limit">
	<p class="advisement">Specifications SHOULD NOT limit the size of data fields unless there is a specific practical or technical limitation.</p>
	</div>

    <p>There are many reasons why a length limit might be needed in a specification or format. Generally length limits correspond to underlying limits in the implementation, such as the use of fixed-size fields in a database or data store, the desire to fit into practical boundaries such as packet size, or some other implementation detail related to storage allocation or efficiency.</p>

    <p>When truncating strings, it's necessary to decide what units to use when counting the size of the string. In many cases this is beyond the control of the specification, since the truncation is occurring for some preordained reason. However, when the choice is available, some general guidelines can be applied.</p>

    <p>If the limitation is related to the number of display positions, the grapheme count usually corresponds most closely to the expected limit. Note that proportional width fonts, combining marks, complex scripts, and many other factors complicate counting "screen positions". In Web pages, for example, the CSS <kbd>text-overflow</kbd> property provides visual truncation without disturbing the content of the text. Attempting to estimate the size of a given piece of text based on the number of Unicode code points or even the number of grapheme clusters is mostly futile.</p>

    <p>Otherwise most limits are expressed in terms of code points in Unicode or code units (such as bytes) in a specific character encoding. Counting code points provides the best user experience, since all Unicode code points are treated identically: if text is truncated after 40 code points, all languages and scripts get the same number of code points to work with. By contrast, when the size limit is expressed in code units such as bytes in UTF-8, users who write in a language that mostly uses ASCII letters get many more characters (code points) for a given size limit than users whose language is mostly made up of characters that take 2, 3, or 4 bytes per code point.</p>

    <aside class="example">
		<h3>Text truncation in UTF-8</h3>

        <p>Below you can see the effect of truncating a given string of text encoded in UTF-8 on a 40-byte boundary. There are several things to notice here.</p>
		<p>First, the number of characters in the truncated string decreases as the number of bytes required per character goes up. So the Cyrillic string has half the number of characters as the ASCII string. The Chinese string has about 1/3 the number. And the emoji string has 1/4.</p>
		<p>Second, in two of the four examples (Cyrillic and Han), the text is truncated on a byte boundary in the middle of a character. The resulting "dangling byte" is rendered as U+FFFD and the byte sequence itself is not valid UTF-8. This can interfere with the validity of a given text file. Unlike many legacy character encodings, UTF-8 is highly patterned, so the longest broken character sequence that can result from mid-character truncation is one character. By contrast, in many legacy encodings, a file or document containing a mid-character truncated string can be wholly changed or rendered unintelligible after that point.</p>
		<table class=truncExample>
		<tr>
			<th>Script</th>
			<th>Truncated Length<br>(code points)</th>
			<th>Avg. Bytes/Code Point</th>
			<th>Truncated Text<br>Byte Values</th>
		</tr>
		<tr>
			<td rowspan=2>ASCII</td>
			<td rowspan=2>40</td>
			<td rowspan=2>1</td>
			<td><kbd>In the loveliest town of all, where the </kbd></td>
		</tr>
		<tr>
			<td><code>49 6E 20 74 68 65 20 6C 6F 76 65 6C 69 65 73 74 20 74 6F 77 6E 20 6F 66 20 61 6C 6C 2C 20 77 68 65 72 65 20 74 68 65 20</code></td>
		</tr>
		<tr>
			<td rowspan=2>Cyrillic</td>
			<td rowspan=2>22</td>
			<td rowspan=2>2</td>
			<td><kbd>В самом прекрасном го�</kbd></td>
	    </tr>
	    <tr>
			<td><code>D0 92 20 D1 81 D0 B0 D0 BC D0 BE D0 BC 20 D0 BF D1 80 D0 B5 D0 BA D1 80 D0 B0 D1 81 D0 BD D0 BE D0 BC 20 D0 B3 D0 BE <span style="color:red">D0</span></code></td>
	    </tr>
		<tr>
			<td rowspan=2>Han</td>
			<td rowspan=2>14</td>
			<td rowspan=2>3</td>
			<td><kbd lang="zh">在最美丽的城镇，那里的房屋�</kbd></td>
		</tr>
		<tr>
			<td><code>E5 9C A8 E6 9C 80 E7 BE 8E E4 B8 BD E7 9A 84 E5 9F 8E E9 95 87 EF BC 8C E9 82 A3 E9 87 8C E7 9A 84 E6 88 BF E5 B1 8B <span style="color:red">E5</span></code></td>
	    </tr>
		<tr>
			<td rowspan=2>Emoji</td>
			<td rowspan=2>10</td>
			<td rowspan=2>4</td>
			<td><kbd>🙊🙁😢😠😧😎😽😉😄😮</kbd></td>
		</tr>
		<tr>
			<td><code>F0 9F 99 8A F0 9F 99 81 F0 9F 98 A2 F0 9F 98 A0 F0 9F 98 A7 F0 9F 98 8E F0 9F 98 BD F0 9F 98 89 F0 9F 98 84 F0 9F 98 AE</code></td>
		</tr>
		</table>
    </aside>


	<div class="req" id="char_trunc_units">
	<p class="advisement">Specifications that limit the length of a string MUST specify which type of unit (extended grapheme clusters, Unicode code points, or code units) the length limit uses.</p>
	</div>

	<div class="req" id="char_trunc_unit_rec">
	<p class="advisement">Specifications that limit the length of a string SHOULD specify the length in terms of Unicode code points.</p>
	</div>

	<div class="req" id="char_trunc_byte_boundary">
	<p class="advisement">If a specification sets a length limit in code units (such as bytes), it MUST specify that truncation can only occur on code point boundaries.</p>
	</div>

	
	<p>Note that this best practice applies equally to specifications based on UTF-16, which uses 16-bit code units, not just to multibyte encodings such as UTF-8.</p>
    	<p>Specifications or APIs that interact with the [[DOM]] need to contend with the fact that character data, including operations such as <kbd>length</kbd>, <kbd>substringData</kbd>, <kbd>insertData</kbd>, <kbd>deleteData</kbd>, and so forth, is specified using UTF-16 code units, not Unicode code points. This can lead to inappropriate mid-character (code point) truncation. Specifications that reference DOM should specify that string operations not occur inside code points, and, where appropriate, avoid starting or ending inside grapheme clusters. Specifications should also include a health warning for implementers and users.</p>
    	<div class=example>
		<p>Example warning. Modify this health warning as appropriate for your specification:</p>
		<div class="example_div">
		<p class="warning">Arbitrary index values in the DOM may not fall on character or grapheme boundaries. Implementations and users should avoid incorrectly starting or ending operations in the middle of a user-perceived character sequence.</p>
		</div>
    	</div>

	<div class="req" id="char_trunc_grapheme_boundary">
	<p class="advisement">Specifications that limit the length of a string SHOULD require truncation on grapheme boundaries, as truncation in the midst of a <a>grapheme</a> or <a>combining character sequence</a> can alter the meaning of the string.</p>
	</div>

	<div class="req" id="char_trunc_indicator">
	<p class="advisement">If a specification specifies a length limit, it SHOULD specify that any string that is truncated include an indicator, such as ellipses, that the string has been altered.</p>
	</div>

	<div class="req" id="char_trunc_min_size">
	<p class="advisement">When specifying a length limitation in code units (such as bytes), specifications SHOULD set the limit in a way that accommodates users whose language requires multibyte code unit sequences.</p>
	</div>
	
	<div class="req" id="char_trunc_character_encoding">
		<p class="advisement">If a specification specifies a length limit in code units (such as bytes), it MUST specify the <a>character encoding</a> used in measuring the limit; such a limit SHOULD NOT specify a <a>legacy character encoding</a>.</p>
	</div>
	
	<p>If a specification permits or requires truncation of a field, the <a>character encoding</a> is important in knowing what the limit means. If the limit is in bytes and <a>legacy character encodings</a> are permitted, note that conversion of Unicode data to a non-Unicode encoding can also result in data loss (since most <a>legacy character encodings</a> encode only a subset of Unicode).</p>
</section>

<section id="strcat" class="subtopic">
<h3>Concatenation of strings</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Astrcat" target="_blank">See related review comments.</a></p>

<div class="req" id="strcat_anti_pattern">
	<p class="advisement">Specifications SHOULD NOT require the concatenation of string values to form [=natural language=] or displayable string values.</p>
</div>

<p>Creation of [=natural language=] text values by concatenating multiple strings together is an internationalization anti-pattern. Languages vary greatly in word order, count, grammatical gender or case, punctuation, and many other requirements. As a result, avoid requiring or suggesting that implementations generate human-readable messages from sub-strings.</p>

<div class="xref"><span class="seealso">See also</span>
   <p><cite><a href="https://www.w3.org/International/articles/composite-messages/index.en.html">Working with Composite Messages</a></cite></p>
</div>

<div class="req" id="bidi_gen_advice">
	<p class="advisement">When a specification requires an implementation to create or generate text which will be displayed to users, the specification SHOULD provide implementers with guidance on how to avoid potential problems related to text direction.</p>
</div>

<p>Specifications for APIs, protocols, or document formats sometimes require an implementation to create or provide a field containing a display name or description. When such a string is assembled from separate parts, it can result in problems with presentation or understanding due to the way that the <cite>Unicode Bidirectional Algorithm</cite> [[UAX9]] processes the assembled string. In such cases, the specification should guide implementers about how to create values that will display properly.</p>

<aside class="example" title="Example of bidi generation guidance">
	<p>Here is an example of a Note that you can include in your specification when recommending that implementations generate displayable [=natural language=] values:</p>
	<div class="note" role="note" id="bidi-gen-note-example">
	   <p class="example_note">When generating a value for <em><code>_field_name_</code></em>, [=producers=] need to ensure that directional changes <em>inside</em> the string will display correctly. When a [=producer=] assembles the value from multiple strings, individual substrings might need to be [=bidi isolated=] in order to prevent [=spillover=] effects. This can be done using markup (where available) or via the insertion of Unicode bidirectional controls. In addition, if the [=producer=] can supply the correct base direction as metadata, it allows the [=consumer=] to both isolate the string and set its direction, avoiding [=spillover=] effects between the full string and the context in which it is eventually displayed.  For more information see: <a href="https://www.w3.org/International/questions/qa-bidi-unicode-controls"><cite>How to use Unicode controls for bidi text</cite></a> and <a href="https://www.w3.org/TR/international-specs/#inline_changes">Inline changes to base direction</a> [[INTERNATIONAL-SPECS]].</p>
	</div>
</aside>

<div class="xref"><span class="seealso">See also</span>
<p>[[[#inline_changes]]]</p>
<p>[[[#bidi_inline]]]</p>
</div>

</section>

<section id="file_naming" class="subtopic">
<h3>Working with file and path names</h3>

<p>Some specifications need to define how file names or file paths are constructed by various implementations. One challenge is building definitions that work consistently when used on the different file systems used by different operating systems. This section contains general guidance when defining restrictions on file names or file paths. It is based on requirements developed in [[EPUB-33]], as well as implementation experience.</p>

<div class="req" id="file_name_utf8">
	<p class="advisement">Specify the UTF-8 [[Unicode]] encoding for the storage and processing of file names and file paths.</p>
	<p></p>
</div>

<div class="req" id="file_name_length">
	<p class="advisement">File names SHOULD be restricted to 255 bytes in length.</p>
	<p>This restriction is related to limitations found in certain file systems, originally MS-DOS, but also certain Unix file systems&mdash;as well as packaging schemes such as PKZIP that depend on these file systems or subsumed their limitations&mdash;in which a specific "path element" (including directory names) is limited to 255 bytes.</p>
</div>

<div class="req" id="path_name_length">
	<p class="advisement">Path names SHOULD be restricted to 65535 bytes in length.</p>
	<p>This restriction is related to limitations found in file systems such as FAT32 or NTFS, which restrict the path length to 32760 (32K) code units in the UTF-16 character encoding. Each UTF-16 code unit takes 16 bits (or 2 bytes), making the limit 65,535 when measured in bytes. Note that a path name limited to 64K <em>bytes</em> in UTF-8 can exceed the path length limits on these file systems, since UTF-8 is a variable width encoding.</p>
</div>

<div class="req" id="file_name_char_restrictions">
	<p class="advisement">File name and path name definitions MUST NOT use the following Unicode code points.</p>
	<p>These characters are known to cause interoperability problems with various file systems. Specifications and implementations should use an abundance of caution in their file naming when interoperability of content is key. The list of restricted characters is intended to help avoid some known problem areas, but it does not ensure that all other Unicode characters are supported.</p>
	<ul>
		<li><span class="codepoint"><bdi>"</bdi><code class="uname">U+0022 QUOTATION MARK</code></span></li>
		<li><span class="codepoint"><bdi>*</bdi><code class="uname">U+002A ASTERISK</code></span></li>
		<li><span class="codepoint"><bdi>/</bdi><code class="uname">U+002F SOLIDUS</code></span></li>
		<li><span class="codepoint"><bdi>:</bdi><code class="uname">U+003A COLON</code></span></li>
		<li><span class="codepoint"><bdi>&lt;</bdi><code class="uname">U+003C LESS-THAN SIGN</code></span></li>
		<li><span class="codepoint"><bdi>&gt;</bdi><code class="uname">U+003E GREATER-THAN SIGN</code></span></li>
		<li><span class="codepoint"><bdi>\</bdi><code class="uname">U+005C REVERSE SOLIDUS</code></span></li>
		<li><span class="codepoint"><bdi>|</bdi><code class="uname">U+007C VERTICAL LINE</code></span></li>
		<li><span class="codepoint"><img alt="DEL" src="images/007F.png"><code class="uname">U+007F DEL</code></span></li>
		<li>Code points in the following ranges:
		   <ul>
			   <li>C0 Controls <code class="uname">U+0000</code>...<code class="uname">U+001F</code></li>
			   <li>C1 Controls <code class="uname">U+0080</code>...<code class="uname">U+009F</code></li>
			   <li>Private Use <code class="uname">U+E000</code>...<code class="uname">U+F8FF</code></li>
			   <li>Specials <code class="uname">U+FFF0</code>...<code class="uname">U+FFFF</code></li>
			   <li>Supplementary Private Use <code class="uname">U+F0000</code>...<code class="uname">U+FFFFF</code></li>
			   <li>Supplementary Private Use <code class="uname">U+100000</code>...<code class="uname">U+10FFFF</code></li>
		   </ul></li>

		<li><span class="codepoint"><bdi>.</bdi><code class="uname">U+002E FULL STOP</code></span> as the last character (Note that this includes the file names <code>.</code> and <code>..</code>, which have special meaning to many file systems)</li><!-- break out -->

		<li>All Unicode non-character code points, specifically:
		<ul>
			<li>The 32 contiguous characters in the Basic Multilingual Plane (U+FDD0 … U+FDEF)</li>
			<li>The last two code points of the Basic Multilingual Plane (U+FFFE and U+FFFF)</li>
			<li>The last two code points at the end of the Supplementary Planes (U+1FFFE, U+1FFFF … U+EFFFE, U+EFFFF)</li>
		</ul></li>

		<li>All Unicode <a href="https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt">deprecated characters</a> (search for "Deprecated" in the file).</li>

	</ul>
</div>

</section>

<section id="char_sort" class="subtopic">
<h3>Specifying sort and search functionality</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Achar_sort" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_char_sort">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
	<li><p class="link">Unicode's <a href="https://github.com/unicode-org/icu/blob/main/docs/userguide/collation/index.md">ICU Collation User Guide</a></p></li>
	<li><p class="link"><a href="https://unicode.org/reports/tr10">Unicode Collation Algorithm</a> [[UTS10]]</p></li>
    <li class="w3"><p class="link"><a href="https://www.w3.org/TR/charmod/#sec-CollationUnits">Units of collation</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite>.</p></li>
</ul>
</aside>

    <p>Applications often need to organize sets of information or content. Frequently this involves sorting the content. Many non-textual data types, such as numbers or dates, can be easily sorted using the internal data representation. When it comes to textual information, however, the nature of character encodings and user expectations regarding "alphabetical" order brings some additional complexity.</p>
    
    <p>One key choice is whether the sorting of textual data will be strictly internal or whether the results will be shown to users.</p>
    
    <section id="internal_sort" class="subtopic">
    <h4>Program Internal Sorting</h4>
    
    <div class="req" id="char_sort_internal_only">
	<p class="advisement">Specifications or implementations that require a program-internal, fast, and deterministic sorting of text which is not intended for human viewing or interaction SHOULD specify that strings are sorted according to their definition of string. For scalar value strings (such as <a href="https://webidl.spec.whatwg.org/#idl-USVString">USVString</a> or many XML processes), specify <em>ascending code point</em> order. For string types based on UTF-16 (such as <a href="https://webidl.spec.whatwg.org/#idl-DOMString">DOMString</a> or in many JavaScript APIs), specify <em>ascending code unit</em> order. </p>
	<details class="links"><summary>explanations &amp; examples</summary>
	   <p>See: <a href="#char_string">Defining 'string'</a></p>
	   <p><a href="https://www.w3.org/TR/xpath-functions/#codepoint-collation">Unicode Codepoint Collation</a> is defined as an XPath Function [[xpath-functions]]</p>
	</details>
    </div>
    
    <p>There are two potential internal sorting sequences: ordering by Unicode [=code point=] or ordering by UTF-16 [=code unit=]. For either type of ordering, the resulting list will not match any particular alphabetic or lexicographical order.</p>
    
    <p>Sorting by [=code point=] makes sense when strings are stored and processed as a sequence of code points, such as in a <a href="https://webidl.spec.whatwg.org/#idl-USVString">USVString</a>. Sorting by [=code unit=] makes sense when strings are stored and processed using the underlying encoding, such as in a <a href="https://webidl.spec.whatwg.org/#idl-DOMString">DOMString</a>.</p>
    
    <aside class="example" title="Code point vs. code unit ordering">
		<p>Consider two strings, one containing <span class="codepoint" translate="no"><bdi lang="ja">&#x1f63a;</bdi><code class="uname">U+1F63A SMILING CAT FACE WITH OPEN MOUTH</code></span> and the other containing <span class="codepoint" translate="no"><bdi lang="ja">&#xff5e;</bdi><code class="uname">U+FF5E FULLWIDTH TILDE</code></span>.</p>
		
		<p>In ascending <em>code point order</em>, the strings sort like:</p>
<pre>
&#xff5e; (U+FF5E)
&#x1f63a; (U+1F63A)
</pre>
		
		<p>In ascending <em>code unit order</em> in the UTF-16 character encoding, the code point U+1F63A is encoded as the code unit sequence <code>0xD83D 0xDE3A</code> (a [=surrogate pair=]), so the strings sort like:</p>	
<pre>
&#x1f63a; (0xD83D 0xDE3A)
&#xff5e; (0xFF5E)
</pre>
       <p>Note that UTF-8 <em>code unit order</em> (that is, when sorting by byte values in UTF-8 encoded byte strings) is the same as code point order.</p>
    </aside>
    
    <p>Neither of these sort orders applies any type of normalization to the strings being compared. This means that some apparently equivalent strings compare as different. See <cite>String Matching</cite> [[CHARMOD-NORM]] for more information.</p>
    
    <aside class="example">
		<p>Consider two strings, <em>A</em> and <em>B</em>. String A consists of <span class="codepoint" translate="no"><bdi lang="en">&#xC7;</bdi><code class="uname">U+00C7 LATIN CAPITAL LETTER C WITH CEDILLA</code></span>. String B consists of the sequence <span class="codepoint" translate="no"><bdi lang="en">&#x43;</bdi><code class="uname">U+0043 LATIN CAPITAL LETTER C</code></span> followed by <span class="codepoint" translate="no"><bdi lang="en">&#x327;</bdi><code class="uname">U+0327 COMBINING CEDILLA</code></span>. These strings are visually identical. Applying Unicode Normalization Form C or Form D to both would cause them to use the same code point (and thus code unit) sequences. However, they compare differently because <code>0x00C7</code> is greater than <code>0x0043</code>:</p>
        <table>
			<thead>
				<tr>
					<th style="text-align:center">String A</th>
					<th></th>
					<th style="text-align:center">String B</th>
				</tr>
			</thead>
			<tbody>
				<tr>
					<td class="exampleChar">&#xC7;</td>
					<td style="vertical-align:middle">&gt;</td>
					<td class="exampleChar">C&#x327;</td>
				</tr>
				<tr>
					<td><code>U+00C7</code></td>
					<td></td>
					<td><code>U+0043 U+0327</code></td>
				</tr>
			</tbody>
        </table>
    </aside>
    
 
	</section>
	<section id="human_sorting" class="subtopic">
	<h4>Human-visible Sorting</h4>
		
    <p>Specifications or applications that need to deal with sorting natural language text for display to users face some additional complexity. Unicode defines a default collation (sorting) order as part of the <cite>Unicode Collation Algorithm</cite> [[UTS10]], which is then tailored to meet the needs of specific languages, [=locales=], and cultures.</p>
    
    <div class="req" id="char_sort_user">
	<p class="advisement">When sorting text for presentation to users, the sort order SHOULD be tailored according to the most appropriate [=locale=] for the specific user in that application; thus the presentation order may differ from user to user.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><cite>Unicode Collation Algorithm</cite> [[UTS10]]</p>
	<p><a href="https://www.unicode.org/reports/tr35/tr35-collation.html">Collation section of Locale Data Markup Language</a> [[UTS35]]</p>
	<p><a href="https://www.w3.org/TR/charmod/#sec-CollationUnits">Units of collation, C007</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>
	
	<p>Languages and cultures vary in how they sort text or use their alphabet or writing system to organize textual data. For example, German language speakers treat the letter <span class="codepoint" translate="no"><bdi lang="de">&#xFC;</bdi><code class="uname">U+00FC LATIN SMALL LETTER U WITH DIAERESIS</code></span> as sorting similar to the letter <strong>u</strong> (there are actually <em>two</em> German sorting sequences, which are slightly different in the exact handling of this letter), while Danish language speakers treat the same letter as separate in the alphabet and sort it after the letter "y".</p>
	
	<p>Determining which locale to use for a sorted list can depend on a number of factors. For example, an application might sort a list of values according to the localization of the page in which the data appears. In other cases it might make more sense to sort according to the runtime locale of the user-agent or according to some parameter passed in an API. The important thing to recognize is that this order might be different for different users or on different systems.</p>
		
<!-- Charmod:Fundamentals included the below best practices, which seem out of date
  	<div class="req" id="char_sort_units">
	<p class="advisement">Software that sorts or searches text for display to users SHOULD do so on the basis of appropriate collation units and ordering rules for the relevant language and/or application.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-CollationUnits">Units of collation, C006</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>
	
  	<div class="req" id="char_sort_alternatives">
	<p class="advisement">Software that allows users to sort or search text SHOULD allow the user to select alternative rules for collation units and ordering.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-CollationUnits">Units of collation, C066</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>

  	<div class="req" id="char_sort_anything">
	<p class="advisement">Specifications and implementations of sorting and searching algorithms SHOULD accommodate text that contains any character in Unicode.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/charmod/#sec-CollationUnits">Units of collation, C008</a>, in <cite>Character Model for the World Wide Web: Fundamentals</cite></p>
	</details>
	</div>
-->
    </section>
    <section id="string-search" class="subtopic">
		<h4>Searching</h4>

	<aside class="warning" id="find-is-hard" title="Under construction">
		<p>Searching text is a complex topic and this document does not currently contain a complete list of recommendations and best practices. The Internationalization Working Group has compiled a list of just a <em>few</em> of the issues involved as part of on-going work to document this space in our document <cite><a href="https://w3c.github.io/string-search">String Searching</a></cite>.</p>
	</aside>
	</section>

</section>
</section>


<section id="sec_resid_non_ascii" class="topic">

<h2 id="resid_misc">Resource identifiers</h2><!-- originally this section has a subsection "basics" with this id -->

<div id="sec_resid_non_ascii_checklist" class="summaryC"></div>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Aresid_misc" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_sec_resid_basic">
    <p class="links_title">Useful background and overviews for this section</p>
    <ul>
        <li class="w3"><p class="link"><a href="https://www.rfc-editor.org/rfc/rfc3986">Uniform Resource Identifiers (URIs)</a> [[RFC3986]]</p></li>
        <li class="w3"><p class="link"><a href="https://url.spec.whatwg.org/">WHATWG URL Specification</a> [[URL]]</p></li>
        <li class="w3"><p class="link"><a href="https://www.rfc-editor.org/rfc/rfc3987">Internationalized Resource Identifiers (IRIs)</a> [[RFC3987]]</p></li>
    </ul>
</aside>

    <p>The situation with regard to specifying support of non-ASCII characters in <a>resource identifiers</a> is complicated because there are at least three specifications (URI [[RFC3986]], IRI [[RFC3987]], and [[URL]]) that define resource identifiers and their serialization. The WHATWG [[URL]] specification is an attempt to address this complexity by documenting the actual practice of browsers and other user agents. The stated goal of the URL specification is to obsolete both RFCs.</p>
    
    <p>In general, document formats on the Web use resource identifiers that encode non-ASCII characters as plain text, that is, as "IRIs". Protocols such as&mdash;but not limited to&mdash;HTTP [[RFC9110]] use resource identifiers that encode non-ASCII characters as a sequence of bytes using <a>percent encoding</a>, that is, as "URIs". Because [[RFC3986]] does not specify any particular <a>character encoding</a> for encoding characters to bytes, the <a>percent encoding</a> escapes are prone to misinterpretation. To help combat this, many modern protocols and specifications expect resource identifiers to use the UTF-8 character encoding, exactly as specified by IRI, when encoding characters into the subset of ASCII supported in wire formats and protocols.</p>
    
    <aside class="example" title="Comparison of IRI to URI">
		<p>Here's an example of an IRI and its URI-encoded equivalent, where the <strong>path</strong> of the identifier is the Japanese word <span lang="ja" translate="no">文字化け</span> ("<a>mojibake</a>"). This word consists of a sequence of four Unicode code points: <code>U+6587 U+5B57 U+5316 U+3051</code>. In UTF-8, each of these code points requires three bytes to encode, so the byte sequences in hex are: <code translate="no">0xE6.96.87 0xE5.AD.97 0xE5.8C.96 0xE3.81.91</code></p>
		<table>
			<thead>
				<tr>
					<th>Format</th>
					<th>Example</th>
				</tr>
			</thead>
			<tbody>
				<tr>
					<td>IRI</td>
					<td><code class="kw" translate="no">https://www.example.com/文字化け</code></td>
				</tr>
				<tr>
					<td>URI</td>
					<td><code class="kw" translate="no">https://www.example.com/%E6%96%87%E5%AD%97%E5%8C%96%E3%81%91</code></td>
				</tr>
			</tbody>
		</table>
    </aside>

    <div class="req" id="resid_use_iris">
	    <p class="advisement">Specifications that define <a>resource identifiers</a> MUST permit the use of non-ASCII characters.</p>
	    <details class="links"><summary>more</summary>
	        <p><a href="https://github.com/w3c/web-annotation/issues/241">Model is defined in terms of IRIs; Protocol with URI</a>. GitHub issue discussion.</p>
	    </details>
	</div>
	
	<p>Document formats or protocols need to support resource identifiers that contain non-ASCII characters because in many cases the names or identifiers for a given resource are generated from user input. Users generally are not restricted and should not be restricted in their ability to use their own language for these values.</p>

    <div class="req" id="resid_what_to_spec">
		<p class="advisement">Specifications on the Web that define a document format, data structure, or API SHOULD reference [[URL]] when specifying <a>resource identifiers</a>. For cases unsupported by the [[URL]] specification, IRI [[RFC3987]] MAY be specified instead.</p>
	</div>
	<div class="req" id="resid_what_to_spec_protocol">
		<p class="advisement">Specifications that define protocols MAY reference URI [[RFC3986]] when specifying <a>resource identifiers</a> for use in wire formats but MUST include the additional requirement that UTF-8 MUST be used for the interpretation of <a>percent encoded</a> values into characters.</p>
    </div>

    <p>According to the definition in [[RFC3986]], URI references are restricted to a subset of ASCII and non-ASCII characters cannot be used directly. The <a>percent encoding</a> is provided to escape arbitrary byte values. However, <a>percent encoding</a> by itself is of limited value because many different <a>legacy character encodings</a> might be used to interpret a given sequence of bytes into characters (or to encode a given sequence of characters into bytes). Internationalized Resource Identifiers (<em>IRIs</em>) [[RFC3987]] solves problems with encoding and interpreting non-ASCII characters in resource identifiers with a uniform approach based on the UTF-8 encoding of [[Unicode]].</p>


	<div class="req" id="resid_char_limits">
	    <p class="advisement">A specification MAY impose its own limitations on which characters are permitted in a <a>resource identifier</a>, but these should be focused on characters that conflict with the syntax of resource identifiers, the transport format, or with other elements defined by the specification itself.</p>
	</div>
	
	<p>While generally not recommended, if additional restrictions are contemplated, review [[UAX31]] and [[CHARMOD-NORM]] for additional guidance.</p>

	<div class="req" id="resid_new_schemes">
		<p class="advisement">Specifications that define new syntax for URIs or contained within URIs MUST specify that characters outside the ASCII repertoire are <a>percent encoded</a> using the UTF-8 <a>character encoding</a>.</p>
	</div>

  </section>


<section id="markup" class="topic">
<h2>Document formats, markup &amp; syntax</h2>


<div id="markup_checklist" class="summaryC"></div>


<!--p>In this section:</p>
	<ul class="summary">
	<li>[[[#markup_elements_attributes]]]</li>
	<li>[[[#markup_identifiers]]]</li>
	<li>[[[#markup_plaintext]]]</li>
	</ul-->

<p>Specifications that deal with formal languages, document formats, protocols, or APIs often need to define markup, syntax, or <a>application internal identifiers</a>. The best practices in this section cover the different needs when defining these.</p>

<p>Specifications that are defining a markup language or a syntax based on a given markup language are concerned with defining elements, attributes, and their values. For example, an [[XML]] DTD defines elements and attributes that are valid in a specific document type.</p>

<p>Specifications that are defining a given document format, protocol, or API are usually concerned with defining identifiers for reserved keywords, field names, or permitted values. Many of these are <a>application internal identifiers</a>, whose names and values are completely defined by the specification. In some cases the specification will permit some or all of these to be a <a>user-supplied value</a> which can be filled in or named by users.</p>

<aside class="example">
	
<p>Let's use CSS to illustrate some of these concepts. Here is a fragment of a style sheet:</p>
<pre class="css">
p.myClassName {
  text-align: center;
  color: red;
}
</pre>

<p>The <code>p.myClassName</code> rule contains two properties (<code>text-align</code> and <code>color</code>). Each of these is an <a>application internal identifier</a>.</p>

<p>Each property has been assigned a value from a list of available values defined by CSS. For example, here the property <code>text-align</code> has been given the value <code>center</code>. Other possible values for this property are part of a list of <a>application internal identifiers</a> defined by [[CSS3-TEXT]] and include <code>start</code>, <code>end</code>, <code>left</code>, <code>right</code>, <code>justify</code>, <code>match-parent</code>, or <code>justify-all</code>.</p>

<p>The class name <code>myClassName</code> is an example of a <a>user-supplied value</a>. A user might assign a class name using any valid class identifier in CSS, including using words in other languages, such as using an identifier in Japanese:</p>

<pre class="css" lang="ja">
p.私のクラス名 {
	color: #434322;
}
</pre>
</aside>


<section id="markup_elements_attributes" class="subtopic">
<h3>Defining elements and attributes in markup</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Amarkup_elements_attributes" target="_blank">See related review comments.</a></p>

  	<div class="req" id="markup_attributes">
	<p class="advisement">Do not define attribute values that will contain user readable content. Use elements for such content.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/xml-i18n-bp/#DevAttributes">Best Practice 3: Avoiding translatable attribute values</a>, in Best Practices for XML Internationalization</p>
	</details>
	</div>

  	<div class="req" id="markup_attributes_fallback">
	<p class="advisement">If you do define attribute values containing user readable content, provide a means to indicate directional and language information for that text separately from the text contained in the element.</p>
	</div>

  	<div class="req" id="markup_span">
	<p class="advisement">Provide a way for authors to annotate arbitrary inline content using a <code class="kw" translate="no">span</code>-like element or construct.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	<p><a href="https://www.w3.org/TR/xml-i18n-bp/#DevSpan">Best Practice 14: Defining a span-like element</a>, in Best Practices for XML Internationalization</p>
	</details>
	</div>
</section>
	
<section id="markup_plaintext" class="subtopic">
<h3>Handling plain text in markup</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Amarkup_plaintext" target="_blank">See related review comments.</a></p>

  	<div class="req" id="plain_avoid">
	<p class="advisement">Avoid <a>natural language</a> text in elements or attribute values that only allow for plain text.</p>
	</div>
	
	<div class="req" id="plain_attr_avoid">
	<p class="advisement">Avoid defining attribute values whose content will be <a>natural language</a> text.</p>
	</div>

  	<div class="req" id="plain_span">
	<p class="advisement">Provide a span-like element that can be used for any text content to apply information needed for internationalization.</p>
	</div>


<p>Internationalization information may include language and base direction metadata, inline changes of language, bidirectional text behavioural changes, translate flags, etc.</p>
</section>


<section id="markup_identifiers" class="subtopic">
<h3>Defining identifiers</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Amarkup_identifiers" target="_blank">See related review comments.</a></p>

    <p>A common feature of document formats is the definition of various identifiers. This includes reserved keywords as well as user-defined values. To foster interoperability, implementations need to be able to match identifier values reliably and consistently. For a detailed look at this problem, see <cite>Character Model: String Matching</cite> [[CHARMOD-NORM]].</p>
    
	<div class="req" id="identifier_content_internal_id">
	<p class="advisement">Specifications that define <a>application internal identifiers</a> (which are never shown to users and are always used for matching or processing within an application or protocol) should limit the content to a printable subset of ASCII. ASCII case-insensitive matching is recommended.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	    <p><a href="https://www.w3.org/TR/charmod-norm/#specifying-content-restrictions">Specifying Content Restrictions</a> in [[CHARMOD-NORM]]</p>
	</details>
	</div>
	
	<p>Sometimes specifications need to define a set of identifiers that content authors interact with or which are meaningful to various types of end-users. Restricting the set of allowable characters to ASCII impedes usability, particularly for speakers of languages that do not use the Latin script or that use characters outside of the ASCII range.</p>
	
    <div class="req" id="identifier_content_visible">
	<p class="advisement">When identifiers are visible or potentially visible to users, specifications should allow the use of non-ASCII Unicode characters, in order to ensure that users in all languages can use the resulting document format or protocol with equal access. Case sensitivity (i.e. no case folding) is recommended.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	    <p><a href="https://www.w3.org/TR/charmod-norm/#specifying-content-restrictions">Specifying Content Restrictions</a> in [[CHARMOD-NORM]]</p>
	</details>
	</div>
		
	<div class="req" id="identifier_non_ascii_namespace">
	<p class="advisement">If <a>application internal identifiers</a> are not restricted to ASCII, specifications should define the characters that are allowed to start and be part of a valid identifier.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	    <p><a href="https://unicode.org/reports/tr31/">Unicode Identifier and Pattern Syntax</a> [[UAX31]]</p>
        <p><a href="http://es5.github.io/x7.html#x7.6">Example</a>: ECMAScript 5, section 7.6 <em>Identifier Names and Identifiers</em></p>
	</details>
	</div>
	
	<p>One key issue when defining an identifier namespace or set of identifiers in a new specification is the handling of combining marks and certain other characters (such as joiners or bidi controls) when parsing the document format: special focus needs to be paid to how the identifier can be "tokenized" (separated from the surrounding text). One means of doing this is to restrict the range of characters allowed to <em>start</em> an identifier to ensure that normal text processing doesn't interfere with matching the identifier later.</p>
	
	<p><a href="https://unicode.org/reports/tr31/"><cite>Unicode Identifier and Pattern Syntax</cite></a> [[UAX31]] provides one model, used notably in programming languages such as Java or <a href="http://es5.github.io/x7.html#x7.6">JavaScript</a>. HTML and CSS also provide <a href="https://html.spec.whatwg.org/multipage/custom-elements.html#valid-custom-element-name">character range definitions</a> for custom identifiers, such as this <a href="https://www.w3.org/TR/xml/#sec-notation">EBNF</a> [[XML]] production:</p>
	
	<pre>
PCENChar ::=
    "-" | "." | [0-9] | "_" | [a-zA-Z] | #xB7 | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x37D] | 
    [#x37F-#x1FFF] | [#x200C-#x200D] | [#x203F-#x2040] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
    [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
	</pre>
	
	<div class="note">
		<p>HTML and CSS processing is defined such that Unicode character properties (such as whether a given character is a combining mark) are not considered when parsing identifiers and tokens. This allows identifiers to start with a combining character and still be processed reliably, but a plain text editor might not handle the value identically.</p>
	</div>
	
	<p>Specifications should exercise care when defining identifiers with regards to the handling of whitespace. Note that there are Unicode horizontal whitespace characters other than the ASCII characters <span class="codepoint" translate="no"><img alt="SP" src="images/0020.png"><code class="uname">U+0020 SPACE</code></span> and <span class="codepoint" translate="no"><img alt="HTAB" src="images/0009.png"><code class="uname">U+0009 TAB</code></span>.</p>
	
	<div class="req" id="identifier_content_surrogates">
	<p class="advisement">Specifications should not allow surrogate <a>code points</a> (<code>U+D800</code> to <code>U+DFFF</code>) or non-character code points in identifiers.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	    <p><a href="https://www.w3.org/TR/charmod-norm/#specifying-content-restrictions">Specifying Content Restrictions</a> in [[CHARMOD-NORM]]</p>
	</details>
	</div>
	
	<div class="req" id="identifier_content_controls">
	<p class="advisement">Specifications should not allow the <kbd>C0</kbd> (<code>U+0000</code> to <code>U+001F</code>) and <kbd>C1</kbd> (<code>U+0080</code> to <code>U+009F</code>) control characters in identifiers.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	    <p><a href="https://www.w3.org/TR/charmod-norm/#specifying-content-restrictions">Specifying Content Restrictions</a> in [[CHARMOD-NORM]]</p>
	</details>
	</div>

  	<div class="req" id="identifier_case">
	<p class="advisement">Identifiers should be case-sensitive when non-ASCII characters are allowed and case <strong>insensitive</strong> when only ASCII characters are allowed.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	    <p><a href="https://www.w3.org/TR/charmod-norm/#specifying-content-restrictions">Specifying Content Restrictions</a> in [[CHARMOD-NORM]]</p>
	</details>
	</div>
	
	
	<div class="req" id="identifier_content_display">
	<p class="advisement"><a>Application internal identifier</a> fields or values must be wrapped with a localizable display value when displayed to end-users.</p>
	<details class="links"><summary>explanations &amp; examples</summary>
	    <p><a href="https://www.w3.org/TR/charmod-norm/#specifying-content-restrictions">Specifying Content Restrictions</a> in [[CHARMOD-NORM]]</p>
	</details>
	</div>
	
   <div class="req" id="identifier_locale_neutral_names">
   <p class="advisement">Choose locale-neutral and culturally-neutral names for fields and values.</p>
   </div>
   
   <p>When defining identifiers, including field names and values, choose names that are as culturally-neutral as possible. For example, prefer <code>postalCode</code> to the (USA-specific) <code>ZIPCode</code> or prefer <code>givenName/familyName</code> to the more culturally-linked <code>firstName/lastName</code>.</p>
		
	
<section id="application_internal" class="subtopic">
<h4>Defining application-internal data values</h4>

   <p>Some specifications need to define the values for a given field in a document format or protocol. When the data values are associated with a specific type, such as numbers or dates, the format of the field is usually defined using some well-known schema, such as [[XMLSCHEMA11-2]] or [[JSON-SCHEMA]].</p>
   
   <div class="req" id="application_internal_code_like">
   <p class="advisement">Specifications that define non-localizable string data values intended to be machine-readable should use values that are not readily confused with natural language text.</p>
   </div>
   
   <p>Many protocols, document formats, or data structures define enumerated values for internal use. These values are not meant to be visible to humans directly. Sometimes it is helpful if these values are given descriptive names (often in English) to aid users working with the specification, protocol, or API or who might need to debug a given document or interaction. When assigning these values in a specification, the names chosen should appear to be "code-like" so that users do not assume that the value can be displayed as if it were natural language text.</p>
   
   <p>There are several styles that different groups have adopted to make application-internal values look "code-like". Choose the one best suited to your specification. These include:</p>
   
   <ul>
      <li><strong>SNAKE_CASE</strong>. Snake case uses ASCII letters and digits, all in uppercase, with words separated by underscores (<span class="codepoint"><bdi lang="en">_</bdi><code class="uname">U+005F LOW LINE</code></span>).</li>
      <li><strong>PascalCase</strong> or <strong>camelCase</strong>. These use ASCII letters and digits, with each "word" inside the identifier being capitalized.</li>
   </ul>
   
   <aside class="example" title="Example of code internal values">
	   <p>For example, a specification might define a document with a set of fields named in camelCase whose predefined values are named in SNAKE_CASE:</p>
	   
	   <pre>
type: [ PURCHASE_REQUEST, INVOICE_REQUEST, RECEIPT_REQUEST ]
responseType: [ EMAIL, SMS_MESSAGE, JSON_DOCUMENT ]
transactionResult: [ SUCCESS, NOT_AUTHORIZED, INTERNAL_ERROR, UNKNOWN_ERROR ]
	   </pre>
   </aside>
   
   <aside class="note">
	   <p>Enumerated values defined by a specification should not be considered as "natural language strings". While they generally use English language words to form the identifiers, they are not intended for end-users to view nor are they free-form text values. Developers are expected to read the definition backing the keywords in enumerated values (the description of which can be localized). In fact, it is a best practice to define values in a locale-neutral way and wrap that with display strings.</p>
   </aside>
   
   <div class="req" id="application_internal_localizable_metadata">
   <p class="advisement">Fields whose content is intended for consumption by humans must always be treated as <a>natural language</a> string values. It must be possible to find the language and base direction metadata for every such field.</p>
   </div>
   
   <p>Fields that contain human-readable strings, particularly those of a descriptive nature, must be assumed to be natural language strings. This is true even if the user viewing the string is expected to be a software developer. It must be possible to determine the language tag and string direction for each such field in a document or data structure.</p>
   
   <p>Common names for fields of this type include <code>name</code>, <code>description</code>, <code>title</code>, <code>message</code>, or occasionally <code>value</code>. One test for this is if, as a specification author or user, you are uncomfortable making the content of the field <kbd>SNAKE_CASE_SHOUTED</kbd>, the field might be better considered as natural language text.</p>
   
   <div class="req" id="application_internal_localizable">
   <p class="advisement">Fields intended for consumption by humans should be localizable.</p>
   </div>
   
   <p>This can take various forms. For example, a specification or protocol might allow for language negotiation and only return the best matching localized strings. Or a given resource might contain multiple languages that the <a>consumer</a> can choose between.</p>
   
   <div class="req" id="application_internal_localizable_wrapper">
   <p class="advisement">Field names and other enumerated values should be wrapped with localizable display names.</p>
   </div>
   
   <p>Field names and enumerated values are not natural language text, even if the names appear to be plain text and might be understood by users. These fields and values should not have language or direction metadata associated with them and, where necessary, implementers should be guided by the specification to provide appropriate localized wrapping.</p>
   
</section>
</section>
</section>


<section id="typography" class="topic">
<h2>Typographic support</h2>


<div id="typography_checklist" class="summaryC"></div>


<!--p>In this section:</p>
	<ul class="summary">
	<li>[[[#text_decoration]]]</li>
	<li>[[[#vertical]]]</li>
	<li>[[[#cursive]]]</li>
	<li>[[[#box_posn]]]</li>
	<li>[[[#ruby]]]</li>
	<li>[[[#typ_misc]]]</li>
	</ul-->


<section id="typ_text_decoration" class="subtopic">
<h3>Text decoration</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Atyp_text_decoration" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_text_decoration">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/typography/#text_decoration">Text decoration</a>, in the Language enablement index.</p></li>
</ul>
</aside>


  	<div class="req" id="textdec_skip">
	<p class="advisement">Text decoration such as underline and overline should allow lines to skip ink.</p>
	</div>

  	<div class="req" id="textdec_distance">
	<p class="advisement">It should be possible to specify the distance of overlines and underlines from the text.</p>
	</div>

    <p>Skipping ink for text decoration such as underlines may not be appropriate for some scripts, such as Arabic, which prefers to move the underline further away from the baseline instead.</p>
</section>


<section id="typ_vertical" class="subtopic">
<h3>Vertical text</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Atyp_vertical" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_vertical">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/typography/#vertical_text">Vertical text</a>, in the Language enablement index.</p></li>
</ul>
</aside>


  	<div class="req" id="vertical_support">
	<p class="advisement">It should be possible to render text vertically for languages such as Japanese, Chinese, Korean, Mongolian, etc.</p>
	</div>

  	<div class="req" id="vertical_lr_rl">
	<p class="advisement">Vertical text must support line progression from LTR (eg. Mongolian) and RTL (eg. Japanese).</p>
	</div>

  	<div class="req" id="vertical_lr_rl2">
	<p class="advisement">By default, text decoration, ruby, and the like in vertical text where lines are stacked from left to right (eg. Mongolian) should appear on the same side as for CJK vertical text. Placement should not rely on the <code class="kw" translate="no">before</code> and <code class="kw" translate="no">after</code> line locations.</p>
	</div>

  	<div class="req" id="vertical_utr50">
	<p class="advisement">Vertical writing modes that are equivalent to the <code class="kw" translate="no">vertical-</code> values in CSS (only) should use [[UTR50]] to apply default text orientation of characters. (This does not apply to writing modes that are equivalent to <code class="kw" translate="no">sideways-</code> in CSS.)</p>
	</div>

	<div class="req" id="vertical_sideways">
	<p class="advisement">Writing modes should provide values like <code class="kw" translate="no">sideways-lr</code> and <code class="kw" translate="no">sideways-rl</code> in CSS to allow for vertical rotation of  lines of horizontal script text. UTR50 is not applicable for these cases.</p>
	</div>

	<div class="req" id="vertical_upright">
	<p class="advisement">By default, glyphs of scripts that are normally horizontal should run along a line in vertical text such that the top of the character is toward the right side of the vertical line, but there should also be a mechanism to allow them to progress down the line in upright orientation. Such a mechanism should use grapheme clusters as a minimum text unit, but where necessary allow syllabic clusters to be treated as a unit when they involve more than one grapheme cluster.</p>
	</div>

  	<div class="req" id="vertical_upright_arabic">
	<p class="advisement">Upright Arabic text in vertical lines should use isolated letter forms and the order of text should read top to bottom.</p>
	</div>

  	<div class="req" id="vertical_tatechuyoko">
	<p class="advisement">It should be possible for some sequences of characters (particularly digits) to run horizontally within vertical lines (tate chu yoko).</p>
	</div>
</section>


<section id="typ_bidi" class="subtopic">
<h3>RTL/bidi text</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Atyp_bidi_x" target="_blank">See related review comments.</a></p>
<aside class="links" id="links_bidi_text">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/typography/#bidi_text">Bidirectional text</a>, in the Language enablement index.</p></li>
</ul>
</aside>

<div class="req" id="italics_left">
	<p class="advisement">Specifications that enable sloping of letterforms SHOULD provide for characters to slope either to the right or to the left according to the needs of the specific language.</p>
</div>

</section>


<section id="typ_box_posn" class="subtopic">
<h3>Setting box positioning coordinates when text direction varies</h3>

  	<div class="req" id="vertical_box_posn">
	<p class="advisement">Box positioning coordinates must take into account whether the text is horizontal or vertical.</p>
	</div>

<p>It is typical, when localizing a user interface or web page, to create mirror-images for the RTL and LTR versions. For example, it is likely that a box that appears near the left side of a window containing English content would appear near the right side of the window if the content is Arabic or Hebrew. It should preferably be automatic for this to change, based on the base direction of the current context, unless there is a strong reason for using absolute geometry. One way to achieve this is to use keywords such as <code class="kw" translate="no">start</code> and <code class="kw" translate="no">end</code>, rather than <code class="kw" translate="no">left</code> and <code class="kw" translate="no">right</code>, to indicate position.</p>
  </section>


<section id="typ_logical" class="subtopic">
<h3><em>Logical properties (TBD)</em></h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Atyp_logical_x" target="_blank">See related review comments.</a></p>
</section>


<section id="typ_cursive" class="subtopic">
<h3>Cursive text</h3>


<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Atyp_cursive" target="_blank">See related review comments.</a></p>
<aside class="links" id="links_cursive">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/typography/#cursive">Cursive text</a>, in the Language enablement index.</p></li>
</ul>
</aside>


  	<div class="req" id="cursive_opacity">
	<p class="advisement">Overlaps should not be exposed when transparency is applied to the joined letters in cursive text, such as for Arabic, Mongolian, and N'Ko.</p>
	</div>

  	<div class="req" id="cursive_border">
	<p class="advisement">When adding a text stroke or shadow, joined letters should not be separated from their neighbors in cursive script text.</p>
	</div>
</section>


<section id="typ_ruby" class="subtopic">
<h3>Ruby text annotations</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Atyp_ruby" target="_blank">See related review comments.</a></p>

<aside class="links" id="links_ruby">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-ruby">What is ruby?</a>, W3C article.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/articles/ruby/markup.en.html">Ruby Markup</a>, W3C article.</p></li>
<li class="w3"><p class="link"><a href="https://www.w3.org/TR/typography/#inline_notes">Inline notes &amp; annotations</a>, in the Language enablement index.</p></li>
</ul>
</aside>


  	<div class="req" id="type_ruby">
	<p class="advisement">'Ruby' style annotations alongside base text should be supported for Chinese, Japanese, Korean and Mongolian text, in both horizontal and vertical writing modes.</p>
	</div>

  	<div class="req" id="ruby_zhuyin">
	<p class="advisement">Ruby implementations should  support zhuyin fuhao (bopomofo) ruby for Traditional Chinese.</p>
	</div>

  	<div class="req" id="ruby_tabular">
	<p class="advisement">Ruby implementations should  support a tabular content model (such that ruby contents can be arranged in a sequence approximating to <code class="kw" translate="no">rb rb rt rt</code>).</p>
	</div>

   	<div class="req" id="ruby_rb">
	<p class="advisement">Ruby implementations should make it possible to use an explicit element for ruby bases, like the <code class="kw" translate="no">rb</code> element in HTML.</p>
	</div>

   	<div class="req" id="ruby_dblsided">
	<p class="advisement">Ruby implementations should allow annotations to appear on either or both sides of the base text.</p>
	</div>
	<div class="req" id="ruby_cjk">
	<p class="advisement">Ruby markup in HTML is designed specifically for Chinese, Japanese, Korean, and Mongolian requirements, and should not be used as a general glossing mechanism.</p>
	</div>
</section>


<section id="typ_font_management" class="subtopic">
<h3><em>Font management (TBD)</em></h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Atyp_font_management_x" target="_blank">See related review comments.</a></p>
</section>


<section id="typ_misc" class="subtopic">
<h3>Miscellaneous</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Atyp_misc" target="_blank">See related review comments.</a></p>

   	<div class="req" id="type_line_height">
	<p class="advisement">Line heights must allow for characters that are taller than English.</p>
	</div>

   	<div class="req" id="type_box_size">
	<p class="advisement">Box sizes must allow for text expansion in translation.</p>
	</div>

   	<div class="req" id="type_linebreak">
	<p class="advisement">Line wrapping should take into account the special rules needed for non-Latin scripts.</p>
	</div>

	<p>Various non-Latin writing systems don't simply wrap text on inter-word spaces. They have additional rules that must be respected. For example</p>
	<ul>
	<li>Chinese, Japanese and Korean wrap after characters, but don't put certain characters at the start/end of a line.</li>
	<li>Thai and other SE Asian scripts wrap at word boundaries, but words are not delimited by spaces – spaces are instead used to separate phrases.</li>
	<li>Tibetan wraps after the tsek character that follows a syllable – words are not separated by spaces, and lines can break within a word.</li>
	<li>Indic and other complex scripts break at orthographic syllable boundaries, which are often two or more grapheme clusters.</li>
	</ul>
<p>See the <a href="https://www.w3.org/TR/css-text-3/#line-breaking">CSS Text Level 3 specification</a> for additional background. (<a href="https://r12a.github.io/scripts/tutorial/part5">This tutorial</a> provides additional examples, if needed.)</p>


   	<div class="req" id="type_presentational_tags">
	<p class="advisement">Avoid specifying presentational tags, such as <code class="kw" translate="no">b</code> for bold, and <code class="kw" translate="no">i</code> for italic. </p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/International/questions/qa-b-and-i-tags">Using &lt;b&gt; and &lt;i&gt; elements</a>, W3C article.</p>
	</details>
	</div>

<p> It is best to avoid presentational markup <code class="kw" translate="no">b</code>, <code class="kw" translate="no">i</code> or <code class="kw" translate="no">u</code>, since it isn't interoperable across writing systems and furthermore may cause unnecessary problems for localisation. In addition, some scripts have native approaches to things such as emphasis, that do not involve, and can be very different from, bolding, italicisation, etc.</p>
<p>In the HTML case, there was a legacy issue, but unless there is one for your specification, the recommendation is that styling be used instead to determine the presentation of the text, and that any markup or tagging should allow for general semantic approaches.</p>
<p>For an explanation of the issues surrounding <code class="kw" translate="no">b</code> and <code class="kw" translate="no">i</code> tags, see <a href="https://www.w3.org/International/questions/qa-b-and-i-tags">Using &lt;b&gt; and &lt;i&gt; elements</a>.</p>
</section>
</section>


<section id="locale" class="topic">
<h2>Locales, date and time values, and locally affected formats</h2>


<div id="locale_checklist" class="summaryC"></div>


<!--p>In this section:</p>
	<ul class="summary">
	<li>[[[#loc_neutral]]]</li>
	<li>[[[#loc_time]]]</li>
	<li>[[[#loc_names]]]</li>
	<li>[[[#loc_forms]]]</li>
	<li>[[[#loc_numbers]]]</li>
	</ul-->


<section id="loc_neutral" class="subtopic">
<h3>Working with locale-affected values</h3>

<p>Software systems that support languages and cultural preferences are said to be <q>internationalized</q>. An internationalized system uses APIs to provide language or culturally specific processing, based on user preferences. These user preferences are usually referred to as a <q>locale</q>. For more information on general internationalization terminology, see <cite>Language Tags and Locale Identifiers</cite> [[LTLI]].</p>

   	<div class="req" id="loc_neutral_neutral">
	<p class="advisement">When defining data formats, use locale-neutral serialization forms.</p>
	</div>


    <p>Data values that are machine-readable and not specific to any particular language or culture are more durable and less open to misinterpretation than values that use one of the many different cultural representations. Things like dates, currencies, and numbers might look similar but have different meanings in different <a>locales</a>. For example, a date represented as the string <kbd>4/7</kbd> can be read as the 7th of April or the 4th of July depending on the user's preference. Similarly, <kbd>€2,000</kbd> is either two thousand Euros or an over-precise representation of two Euros. By using a locale-neutral format, systems avoid the need to establish specific interchange rules that vary according to the language or location of the user. When the data is already in a locale-specific format, making the locale and language explicit by providing <a>locale</a> parameters (usually in the form of a <a>language tag</a>) allows users to determine how to work with the data or perhaps enable automated translation services.</p>

    <p>Most common data serialization formats are locale-neutral. For example, [[XMLSchema-2]] types such as <code>xsd:integer</code> and <code>xsd:date</code> are intended for locale-neutral data interchange. Using locale-neutral representations allows the data values to be processed accurately without complex parsing or misinterpretation and also allows the data to be presented in the format most comfortable for the consumer of the data in any locale. For example, rather than storing "€2000,00" as a string, it is strongly preferred to exchange a data structure such as:</p>

    <pre>…
<span class="hljs-string">"price"</span>: {
    <span class="hljs-string">"value"</span>: <span class="hljs-number">2000.00</span>,
    <span class="hljs-string">"currency"</span>: <span class="hljs-string">"EUR"</span>
}
…</pre>
</section>


<section id="loc_time" class="subtopic">
<h3>Working with time</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Aloc_time" target="_blank">See related review comments.</a></p>


   	<div class="req" id="loc_time_preCE">
	<p class="advisement">When defining calendar and date systems, be sure to allow for dates prior to the common era, or at least define handling of dates outside the most common range.</p>
	</div>

   	<div class="req" id="loc_time_utc">
	<p class="advisement">When defining time or date data types, ensure that the time zone or relationship to UTC is always defined.</p>
	</div>

   	<div class="req" id="loc_time_floating">
	<p class="advisement">Provide a health warning for conversion of time or date data types that are "floating" to/from incremental types, referring as necessary to the <a href="https://www.w3.org/TR/timezone/"><cite>Time Zones</cite> WG Note</a>.</p>
	<details class="links"><summary>more</summary>
	<p><a href="https://www.w3.org/TR/timezone/">Working with Time Zones</a></p>
	</details>
	</div>

   	<div class="req" id="loc_time_leapsec">
	<p class="advisement">Allow for leap seconds in date and time data types.</p>
	</div>

	<p>These occur occasionally when the number of seconds in a minute is allowed to range from 0 to 60 (ie. there are sixty-ONE seconds in that minute).</p>

   	<div class="req" id="loc_time_consistency">
	<p class="advisement">Use consistent terminology when discussing date and time values. Use 'floating' time for time zone independent values.</p>
	</div>

   	<div class="req" id="loc_time_zone_offset">
	<p class="advisement">Keep separate the definition of time zone from time zone offset.</p>
	</div>

   	<div class="req" id="loc_time_zone_ids">
	<p class="advisement">Use IANA time zone IDs to identify time zones. Do not use offsets or LTO as a proxy for time zone.</p>
	</div>

   	<div class="req" id="loc_time_zone_field">
	<p class="advisement">Use a separate field to identify time zone.</p>
	</div>

   	<div class="req" id="loc_time_week">
	<p class="advisement">When defining rules for a "week", allow for culturally specific rules to be applied.</p>
	</div>

	<p>For example, the weekend is not always Saturday/Sunday; the first day of week is not always Sunday [or Monday or...], etc.</p>

	<div class="req" id="loc_time_week_number">
	<p class="advisement">When defining rules for week number of year, allow for culturally specific rules to be applied.</p>
	</div>

	<div class="req" id="loc_time_13">
	<p class="advisement">When non-Gregorian calendars are permitted, note that the "month" field can go to 13 (undecimber).</p>
	</div>
</section>


<section id="loc_names" class="subtopic">
<h3>Working with personal names</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Aloc_names" target="_blank">See related review comments.</a></p>


<p>Developers who create applications that use personal names (in web forms, databases, ontologies, and so forth) are often unaware of how different  names can be  in other countries. They build their forms or databases in a way that assumes too much on the part of foreign users. This section provides guidelines for working with personal names from around the world.</p>
<aside class="links" id="links_loc_names">
<p class="links_title">Useful background and overviews for this section</p>
<ul>
<li class="w3"><p class="link"><a href="https://www.w3.org/International/questions/qa-personal-names">Personal names around the world</a>.</p></li>
</ul>
</aside>


<section id="loc_names_field_length">
<h4>Field length &amp; composition</h4>

	<div class="req" id="loc_names_given_family">
	<p class="advisement">Check whether you really need to store or access given name(s) and family name(s) separately.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#singlefield">To split or not to split?</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>Names around the world differ greatly in composition and the order of components (see <a href="https://www.w3.org/International/questions/qa-personal-names">Personal names around the world</a>). This can create difficulties if, for example, you try to split a person's name into smaller parts for storage in a database and then later attempt to retrieve them, especially if some reconstruction is needed. Difficulties include understanding which part of a person's name belongs in which database field (especially when there are more or fewer parts than fields in the database), and dealing with the ordering of name parts when retrieving someone's name from the database for actual use.</p>
<p>If designing a form or database that will accept names from people with a variety of backgrounds, you should ask yourself whether you really need to have separate fields for things like given name and family name. This will depend on what you need to do with the data, but obviously it will be simpler, where it is possible, to just use the full name as the user provides it.</p>


	<div class="req" id="loc_names_limits">
	<p class="advisement">Avoid placing limits on the length of names, or if you do, make allowance for long strings.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#singlefield">To split or not to split?</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>Bear in mind that names in some cultures can be quite a lot longer than your own. Make  fields long enough to enter long names. Also do not assume that a name will have more than one letter. </p>
<p>In particular, avoid counting length in bytes (see [[[#char_string]]]) – do not assume that a four-character Japanese name in UTF-8 will fit in four bytes; you are likely to actually need 12.</p>
</section>


<section id="loc_names_segments">
<h4>Guidelines for segmenting names</h4>


<p>The guidelines in this section apply where a decision has been made that it is necessary to split up a person's name for storage or presentation.</p>


<div class="req" id="loc_names_first_last">
	<p class="advisement">Try to avoid using the labels 'first name' and 'last name'. Consider an alternative such as 'given name(s)' and 'family name(s)'.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#splitting">Strategies for splitting up names</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>Use of the terms 'first' and 'last' can be confusing for people who normally write their family name followed by given names.  Although it may seem acceptable to use 'first' and 'last' for forms aimed at users in the United States, the forms may eventually be used by people with different cultural backgrounds, both within and potentially outside of the USA.</p>
<p> Bear in mind, also, that in some cultures this is still problematic, such as for Icelanders, who don't actually have family names, but have a given name and a patronymic (see <a href="https://www.w3.org/International/questions/qa-personal-names.en.html#patronymic">Given name and patronymic</a>). However, short of highly localized customization, this is probably the best we can do for a generic solution.</p>
<div class="req" id="loc_names_extra_fields">
<p class="advisement">Consider whether it would make sense to have one or more extra fields, in addition to the full name field, where users can provide part(s) of their name that you need to use for a specific purpose.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#splitting">Strategies for splitting up names</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
</div>

	<div class="req" id="loc_names_how_contact">
	<p class="advisement">Allow for users to be asked separately how they would like to be addressed when someone contacts them.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#splitting">Strategies for splitting up names</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>For example, in some cases you may want to identify parts of a name so that you can sort a list of names alphabetically, or address them when contacting them, etc.</p>
   
<p>This extra field would also be useful for finding the appropriate name from a long list of name components, and for handling nicknames (which, for example, are commonly used to refer to people in Thailand).</p>
<p>Sometimes you may opt for separate fields because you want to be able to use part of the name to address the person directly, or refer to them, for example when a social media app refers to "David's contacts". Or perhaps it's because you want to send them emails with their name at the top. Note that not only may you have problems due to name syntax here, but you also have to account for varying expectations around the world with regard to formality (not everyone is happy for a stranger to call them by their given name). It may be better to ask separately, when setting up a profile for example, how that person would like you to address them.</p>

<div class="req" id="loc_names_relevant_info">
<p class="advisement">If parts of a person's name are captured separately, ensure that the separate items can capture all relevant information.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#splitting">Strategies for splitting up names</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
</div>

<p>For example, don't assume that the order they  provide names in will be 'given name' followed by 'family name', or that it will be possible in a name that is composed of multiple words to even identify which part fits into which of those categories and which parts relate to something completely different, such as a father's name, a village name, a clan name, etc.</p>

<div class="req" id="loc_names_algorithms">
	<p class="advisement">Be careful about assumptions built into algorithms that pull out the parts of a name automatically.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#splitting">Strategies for splitting up names</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
    <details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
</div>

<p>For example, the v-card and h-card approach of implied “n” optimization could have difficulties with, say, Chinese names. The input form should be as clear as possible when telling people how to specify their name, so that you capture the data you think you need.</p>

	<div class="req" id="loc_names_initials">
	<p class="advisement">Don't assume that a single letter name is an initial.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#splitting">Strategies for splitting up names</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>People do have names that are one letter long. These people can have problems if the form validator refuses to accept their name and demands that they supply their name in full. If you want to encourage people not to use initials, perhaps you should make that a warning message, rather than block the form submission.</p>

	<div class="req" id="loc_names_famname_reqd">
	<p class="advisement">Don't require that people supply a family name.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#splitting">Strategies for splitting up names</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p> In cultures such as parts of Southern India, Malaysia, and Indonesia, a large number of people have names that consist of a given name only, with no patronym. If you require family names, you may create significant problems in these cultures, as users enter garbage data like &quot;.&quot; or &quot;Mr.&quot; in the family name field just to escape the form.</p>
</section>


<section id="loc_names_allowable">
<h4>Allowable characters</h4>

	<div class="req" id="loc_names_punctuation">
	<p class="advisement">Allow people to use punctuation such as hyphens, apostrophes, etc. in names, and take into account possible alternative code points for those characters.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#otherthings">Other things</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>This ensures that names are correctly handled for people such as Dina Asher-Smith and Christopher O'Connell.  Note that the apostrophe may appear as <span class="codepoint" translate="no"><bdi lang="en">&#x0027;</bdi><code class="uname">U+0027 APOSTROPHE</code></span> or as <span class="codepoint" translate="no"><bdi lang="en">&#x02BC;</bdi><code class="uname">U+02BC MODIFIER LETTER APOSTROPHE</code></span>, or perhaps even <span class="codepoint" translate="no"><bdi lang="en">&#x2019;</bdi><code class="uname">U+2019 RIGHT SINGLE QUOTATION MARK</code></span>.  A hyphen may be represented using <span class="codepoint" translate="no"><bdi lang="en">&#x002D;</bdi><code class="uname">U+002D HYPHEN-MINUS</code></span> or <span class="codepoint" translate="no"><bdi lang="en">&#x2010;</bdi><code class="uname">U+2010 HYPHEN</code></span> or, in Japan, <span class="codepoint" translate="no"><bdi lang="ja">&#x30A0;</bdi><code class="uname">U+30A0 KATAKANA-HIRAGANA DOUBLE HYPHEN</code></span>.</p>
<div class="req" id="loc_names_uppercase">
<p class="advisement">Don't require names to be entered all in upper case.</p>
<details class="links"><summary>source</summary>
<p><a href="https://www.w3.org/International/questions/qa-personal-names#otherthings">Other things</a>, in <cite>Personal Names Around the World</cite>.</p>
</details>
<details class="links"><summary>examples</summary>
<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
</details>
</div>

	<div class="req" id="loc_names_case_normalization">
	<p class="advisement">Don't normalize the casing in names.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#otherthings">Other things</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>Some names (such as 'McNamara') contain capital letters that are not the first letter; others (such as 'van der Waals') include words that are not capitalized. Forms should preserve the case the user enters and not coerce such names to always or only use capital letters at the start of each word.</p>

	<div class="req" id="loc_names_spaces">
	<p class="advisement">Allow the user to enter a name with spaces.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#otherthings">Other things</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>This allows correct capture of a family name such as that of Gabriel García Márquez (family name, García Márquez), or a given name such as that of José María Olazábal (given name, José María; family name, Olazábal).</p>
</section>


<section id="loc_names_other">
<h4>Other considerations</h4>


	<div class="req" id="loc_names_same_names">
	<p class="advisement">Don't assume that members of the same family will share the same family name.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#otherthings">Other things</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>It would be wrong to assume that members of the same family share the same family name. There is a growing trend in the West for individuals to keep their own name after marriage, but there are other cultures, such as China, where this is the normal approach. In some countries the wife may or may not take the husband's name.</p>

<p>When dealing with Hispanic names it may be that only the children in the family have the same family names, but they may be different from each of the parents.  Manuel Pérez Quiñones derived his apellidos (Pérez Quiñones) because his father's apellidos were Pérez Rodríguez and his mother's apellidos were Quiñones Alamo. In time, he courted a girl with the apellidos Padilla Falto. When they married, her apellidos became Padilla de Pérez. Their children were called Pérez Padilla, and so on.</p>

	<div class="req" id="loc_names_nee">
	<p class="advisement">It may be better for a form to ask for 'Previous name' rather than 'Maiden name' or 'née'.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#otherthings">Other things</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>You should also not simply assume that name adoption goes from husband to wife. Sometimes men take their wife's name on marriage. It may be better, in these cases, for a form to say 'Previous name' than 'Maiden name' or 'née'.</p>


	<div class="req" id="loc_names_latin_native">
	<p class="advisement">You probably need to store the name in both Latin and native scripts, in which case you will need to ask the user to submit their name in both native script and Latin-only form, as separate items.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#encoding">Implications for character support</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>The need for multiple fields will depend to some extent on what you are collecting people’s names for, and how you intend to use them.</p>
<ul>
<li>Are you collecting the person’s name just to have an identifier in your system?  If so, it may not matter whether the name is stored in ASCII-only or native script.</li>

<li>Or will they be called by name on a welcome page or in correspondence?  If you will correspond using their name on pages written in their language, it would seem sensible to have the name in the native script.</li>

<li>Is it important for people in the organization that handles queries to be able to recognize and use the person’s name?  If so, you may need a Latin transcription.</li>

<li>Will their name be displayed or searchable (for example Flickr optionally shows people’s names as well as their user name on their profile page)?  Or will you want to send them correspondence in their own language, but track them in your back-office in a language such as English? If so, it may be necessary to store the name in both Latin and native scripts, in which case you probably need to ask the user to submit their name in both native script and Latin-only form, using separate fields.</li>
</ul>


	<div class="req" id="loc_names_transcription">
	<p class="advisement">Provide a field for a transcription of the name, where necessary.</p>
	<details class="links"><summary>source</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#encoding">Implications for character support</a>, in <cite>Personal Names Around the World</cite>.</p>
	</details>
	<details class="links"><summary>examples</summary>
	<p><a href="https://www.w3.org/International/questions/qa-personal-names#examples">Examples of differences</a> in <cite>Personal Names Around the World</cite>.</p>
	</details>
	</div>

<p>For example, Japanese users may need to provide a transcription in a Japanese syllabic script rather than/in addition to the ideographic form. This field is used for sorting Japanese names, but also allows someone looking at the name to check how it is pronounced.</p>


	<div class="req" id="loc_names_unexpected">
	<p class="advisement">Don't block unusual or unexpected names when trying to enforce real name usage.</p>
	</div>

<p>It isn't hard to find examples of people who have been blocked from using a service because their name doesn't conform to expectations of the developers. If you are planning to enforce real name usage, you need to allow a mechanism for people to validate their actual names if their name is rare, or has an unexpected structure.</p>
</section>

<section id="personal_name_examples">
	<h4>Using personal names in examples</h4>

	<div class="req" id="loc_names_global_audience">
	<p class="advisement">In standards and standards related documents containing examples that include names of persons, use a variety of names to reflect a global audience. Avoid a bias of names specific to certain regions.</p>
	</div>
	
<p>Many specifications provide examples, such as user stories or use cases, that use personal names as a means of enhancing the narrative. Some groups even have practices, such as security specifications using the names "Alice" and "Bob", to provide a certain level of consistency. Inclusiveness should be an important goal when building systems and services, hence the suggestion to use globally diverse names in forming examples. This helps ensure that we represent the worldwide community of users with our technologies, and makes the specification appear more relevant to the global user.</p>

<p>Try to choose names that represent people from different regions around the world, rather than just a handful of names with European origins. Note that choosing names that include non-ASCII characters can help remind implementers that Unicode support and other internationalization concerns apply to their users.</p>

<p>No collection of names can be completely agnostic in dealing with cultural and gender-related issues. To assist specification writers in creating more inclusive examples, this document provides a collection of names drawn from across many cultures. These names are organized approximately into regions, usually indicating country or language. Notice that even within these regions there are quite diverse influences and practices for the handling of personal names. The names are also divided by their cultural gender association to assist specification authors in writing examples, although many names are not specific to any particular gender.</p>

<p>Inserting personal names from other cultures into English-language examples is also affected by the very different ways that names are used culturally around the world. For example, some cultures expect the use of a patronym/matronym in addition to the given name; or some cultures prefer more formal names (e.g. "<em>Herr Dürer</em>" vs. the informal "<em>Albrecht</em>").</p>

<p>Chinese people almost never use their given name without also including their family name. When writing examples in Chinese, one might see something like <span lang="zh-Hans">路人甲</span> (it means Person A, using the Han "Heavenly Stem" ordinals, cf. <a href="https://www.w3.org/TR/predefined-counter-styles/#cjk-heavenly-stem">Ready-made Counter Styles</a>) rather than an "exemplar name". When examples are used, they include both the family and given name. Bear in mind that in Chinese the family name comes first, before the given name.</p>

<p>In Japanese, there are complex choices related to levels of formality. A person <em>might</em> be addressed by their given name in very informal situations (<em>Hiroshi</em>), but usually will be addressed with a family name that includes (unless one is being rude) a title or suffix, such as <code>-san</code> or <code>-sama</code> (e.g. <em>Tanaka-san</em>). Other suffixes or titles are also used, such as <em>senpai</em> or <em>sensei</em> (for senior or very esteemed individuals) or <em>shi</em> (when one is unfamiliar with the person). Thus an example in English that could say <em>Suppose Hiroki wants to set up a...</em> would probably be more culturally appropriate if it read <em>Suppose Tanaka-san wants to set up a...</em></p>

<h5>Example names</h5>
	
<p>The following table was compiled by the Internationalization Working Group. Contributions and suggestions for additions or corrections are welcome.</p>

	<p>The purpose of this collection of names is to assist specification authors who are generally writing for an English-speaking audience. The collection consists primarily of given names and, where necessary, is transliterated into the Latin script. The names are also rendered informally (<em>"Alice"</em> rather than <em>"Ms. Jones"</em>), even though this is not how names would be used in many of these cultures. When translating specifications, adjustments should be made which are appropriate for the target audience.</p>
	
	<p>When names are taken from non-Latin-script languages or cultures, the non-Latin representation is also provided as a reminder that names are in no way limited to the Latin script or for cases where you want to include a non-Latin script example.</p>

<p><em>This table can be sorted by clicking on the &#x25b3; or &#x25bd; arrows in the header row.</em></p>

<table id="exampleNamesTable" class="exampleNamesTable">
	<thead>
		<tr>
			<th>Name <span onclick="sortTable('exampleNamesTable', 0, false)">&#x25b3;</span><span onclick="sortTable('exampleNamesTable', 0, true)">&#x25bd;</span></th>
			<th>Native <span onclick="sortTable('exampleNamesTable', 1, false)">&#x25b3;</span><span onclick="sortTable('exampleNamesTable', 1, true)">&#x25bd;</span></th>
			<th>Gender <span onclick="sortTable('exampleNamesTable', 2, false)">&#x25b3;</span><span onclick="sortTable('exampleNamesTable', 2, true)">&#x25bd;</span></th>
			<th>Region and Notes <span onclick="sortTable('exampleNamesTable', 3, false)">&#x25b3;</span><span onclick="sortTable('exampleNamesTable', 3, true)">&#x25bd;</span></th>
			<th>Language <span onclick="sortTable('exampleNamesTable', 4, false)">&#x25b3;</span><span onclick="sortTable('exampleNamesTable', 4, true)">&#x25bd;</span></th>
		</tr>
	</thead>
	<tbody>
<tr>
    <td>Akamu</td>
    <td></td>
    <td>m</td>
    <td>Oceania; Polynesia; Hawaiian name</td>
    <td>haw
<tr>
    <td>Alinta</td>
    <td></td>
    <td>f</td>
    <td>Oceania; Australian indigenous name</td>
    <td>nys
<tr>
    <td>Amélie</td>
    <td></td>
    <td>f</td>
    <td>Europe; France</td>
    <td>fr
<tr>
    <td>An</td>
    <td lang="ja">杏</td>
    <td>f</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Aoi</td>
    <td lang="ja">葵; 蒼; 碧</td>
    <td>f, m</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Aroha</td>
    <td></td>
    <td>f</td>
    <td>Oceania; Maori</td>
    <td>mi
<tr>
    <td>Asahi</td>
    <td lang="ja">朝陽</td>
    <td>m</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Atlahua</td>
    <td></td>
    <td>m</td>
    <td>Latin America; Nahuatl name</td>
    <td>nah
<tr>

    <td>Åsa</td>
    <td></td>
    <td>f</td>
    <td>Europe; Sweden</td>
    <td>sv
<tr>
    <td>Beata</td>
    <td></td>
    <td>f</td>
    <td>Europe; Multiple countries</td>
    <td>it, de, pl, sv, etc.
<tr>
    <td>Chanda</td>
    <td lang="hi">चंदा</td>
    <td>f</td>
    <td>South Asia; originally from Sanskrit</td>
    <td>sa
<tr>
    <td>Chirapathi</td>
    <td lang="ta">சிரபதி</td>
    <td>f</td>
    <td>South Asia; Tamil</td>
    <td>ta
<tr>
    <td>Citlali</td>
    <td></td>
    <td>f</td>
    <td>Latin America; Nahuatl</td>
    <td>nah
<tr>
    <td>Coen</td>
    <td></td>
    <td>m</td>
    <td>Europe; Netherlands; also Oceania (Australian indigenous) or Hebrew name</td>
    <td>nl, he, nys
<tr>
    <td>Daisho</td>
    <td lang="ja">大翔</td>
    <td>m</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Dara</td>
    <td></td>
    <td>f</td>
    <td>West Asia; Europe; Türkiye</td>
    <td>tr
<tr>
    <td>Eva</td>
    <td lang="ru">Е́ва</td>
    <td>f</td>
    <td>Europe; Russia</td>
    <td>ru
<tr>
    <td>Faheem</td>
    <td lang="ar" dir="rtl">فهيم</td>
    <td>m</td>
    <td>West Asia; Arabic</td>
    <td>ar
<tr>
    <td>Fátima</td>
    <td lang="ar" dir="rtl">فَاطِمَة</td>
    <td>f</td>
    <td>West Asia; Arabic; also used in several European cultures in the Latin script</td>
    <td>ar
<tr>
    <td>Genet</td>
    <td lang="am">ገነት</td>
    <td>f</td>
    <td>Africa; Ethiopia</td>
    <td>am
<tr>
    <td>Haruto</td>
    <td lang="ja">陽翔</td>
    <td>m</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Haukea</td>
    <td></td>
    <td>f</td>
    <td>Oceania; Polynesia; Hawaiian name</td>
    <td>haw
<tr>
    <td>Himari</td>
    <td lang="ja">陽葵</td>
    <td>f</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Hina</td>
    <td lang="ja">陽菜</td>
    <td>f</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Hīnano</td>
    <td></td>
    <td>m</td>
    <td>Oceania; Polynesia; Tahitian</td>
    <td>ty
<tr>
	<td>Hua</td>
	<td lang="zh-Hans">李华</td>
	<td>m</td>
	<td>East Asia; China</td>
	<td>zh-Hans
<tr>
    <td>Iakopo</td>
    <td></td>
    <td>m</td>
    <td>Oceania; Samoa</td>
    <td>sm
<tr>
    <td>Ilango</td>
    <td lang="ta">இளங்கோ</td>
    <td>m</td>
    <td>South Asia; Tamil</td>
    <td>ta
<tr>
    <td>Irepani</td>
    <td></td>
    <td>m</td>
    <td>Latin America; Purepecha language</td>
    <td>tsz
<tr>
    <td>Işık</td>
    <td></td>
    <td>f</td>
    <td>West Asia; Europe; Türkiye</td>
    <td>tr
<tr>
    <td>Işıtan</td>
    <td></td>
    <td>m</td>
    <td>West Asia; Europe; Türkiye</td>
    <td>tr
<tr>
    <td>Itsuki</td>
    <td lang="ja">樹</td>
    <td>m</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Jarra, Jarrah, Cerrah</td>
    <td lang="ar" dir="rtl">جراح</td>
    <td>m</td>
    <td>West Asia; Arabic</td>
    <td>ar, tr
<tr>
    <td>Jean-François</td>
    <td></td>
    <td>m</td>
    <td>Europe; French</td>
    <td>fr
<tr>
    <td>João</td>
    <td></td>
    <td>m</td>
    <td>Latin America; Brazil</td>
    <td>pt-BR
<tr>
    <td>Júlía</td>
    <td></td>
    <td>f</td>
    <td>Europe; Iceland</td>
    <td>is
<tr>
    <td>Kai</td>
    <td></td>
    <td>f, m</td>
    <td>Oceania; Australia; <em>appears in many languages and is a good general example</em></td>
    <td>aus, sm
<tr>
    <td>Khaliun</td>
    <td></td>
    <td>f, m</td>
    <td>East Asia; Mongolia</td>
    <td>mn
<tr>
    <td>Kylie</td>
    <td></td>
    <td>f</td>
    <td>Oceania; Australian indigenous name</td>
    <td>aus
<tr>
    <td>Lani</td>
    <td></td>
    <td>f</td>
    <td>Oceania; Philippines</td>
    <td>fil
<tr>
    <td>Lei</td>
    <td lang="zh-Hans">李雷</td>
    <td>m</td>
    <td>East Asia; China</td>
    <td>zh-Hans
<tr>
    <td>Livia</td>
    <td></td>
    <td>f</td>
    <td>Europe; Latin America</td>
    <td>es
<tr>
    <td>Lowanna</td>
    <td></td>
    <td>f</td>
    <td>Oceania; Australian indigenous</td>
    <td>aus
<tr>
    <td>Lucas</td>
    <td></td>
    <td>m</td>
    <td>Latin America</td>
    <td>es
<tr>
    <td>Maevarau</td>
    <td></td>
    <td>m</td>
    <td>Oceania; Samoa</td>
    <td>sm
<tr>
    <td>Mahmut</td>
    <td></td>
    <td>m</td>
    <td>West Asia; Europe; Türkiye</td>
    <td>tr
<tr>
    <td>Martina</td>
    <td></td>
    <td>f</td>
    <td>Latin America</td>
    <td>es
<tr>
    <td>Mei</td>
    <td><span lang="ja">芽依</span> (<code>ja</code>); <span lang="zh-Hans">梅</span> (<code>zh</code>)</td>
    <td>f</td>
    <td>East Asia; China; Japan</td><!-- china before japan because the two lists sort next to each other -->
    <td>ja, zh
<tr>
    <td>Minato</td>
    <td lang="ja">湊</td>
    <td>m</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Mio</td>
    <td lang="ja">澪</td>
    <td>f</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Miriam</td>
    <td lang="he" dir="rtl">מרים</td>
    <td>f</td>
    <td>West Asia; Hebrew</td>
    <td>he
<tr>
    <td>Müge</td>
    <td></td>
    <td>f</td>
    <td>West Asia; Europe; Türkiye</td>
    <td>tr
<tr>
    <td>Muhammad</td>
    <td lang="ar" dir=rtl>محمد</td>
    <td>m</td>
    <td>West Asia; Arabic; Many variants and languages.</td>
    <td>ar
<tr>
    <td>Ngatemi</td>
    <td></td>
    <td>f</td>
    <td>Oceania; Indonesia</td>
    <td>id, ms
<tr>
    <td>Thị Anh</td>
    <td></td>
    <td>f</td>
    <td>South-East Asia; Vietnam</td>
    <td>vi-VN
<tr>
    <td>Văn Hoa</td>
    <td></td>
    <td>m</td>
    <td>South-East Asia; Vietnam</td>
    <td>vi-VN
<tr>
    <td>Onosaʻi</td>
    <td></td>
    <td>f</td>
    <td>Oceania; Samoa</td>
    <td>sm
<tr>
    <td>Potira</td>
    <td></td>
    <td>f</td>
    <td>Latin America; Brazil; indigenous name</td>
    <td>gn
<tr>
    <td>Qiàn</td>
    <td lang="zh-Hans">倩</td>
    <td>f</td>
    <td>East Asia; China</td>
    <td>zh-Hans
<tr>
    <td>Rattiya</td>
    <td lang="th">รัตติยา</td>
    <td>f</td>
    <td>South-East Asia; Thailand</td>
    <td>th
<tr>
    <td>Ren</td>
    <td lang="ja">蓮</td>
    <td>m</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Rin</td>
    <td lang="ja">凛</td>
    <td>f</td>
    <td>East Asia; Japan</td>
    <td>ja
<tr>
    <td>Ritthichai</td>
    <td lang="th">ฤทธิชัย</td>
    <td>m</td>
    <td>South-East Asia; Thailand</td>
    <td>th
<tr>
    <td>Santiago</td>
    <td></td>
    <td>m</td>
    <td>Latin America</td>
    <td>es
<tr>
    <td>Senthil</td>
    <td lang="ta">செந்தில்</td>
    <td>m</td>
    <td>South Asia; Tamil</td>
    <td>ta
<tr>
    <td>Sione</td>
    <td></td>
    <td>m</td>
    <td>Oceania; Tonga</td>
    <td>to
<tr>
    <td>Slobodan</td>
    <td lang="sr-Cyrl">Слободан</td>
    <td>m</td>
    <td>Europe; Serbian</td>
    <td>sr
<tr>
    <td>Sofia</td>
    <td></td>
    <td>f</td>
    <td>Europe; Latin America</td>
    <td>es
<tr>
    <td>Tahnee</td>
    <td></td>
    <td>f</td>
    <td>Oceania; Australian indigenous</td>
    <td>aus
<tr>
    <td>Tamizhachi</td>
    <td lang="ta">தமிழச்சி</td>
    <td>f</td>
    <td>South Asia; Tamil</td>
    <td>ta
<tr>
    <td>Temuera</td>
    <td></td>
    <td>m</td>
    <td>Oceania; Polynesia</td>
    <td>sm
<tr>
    <td>Tuulikki</td>
    <td></td>
    <td>f</td>
    <td>Europe; Finland</td>
    <td>fi
<tr>
    <td>Uriel</td>
    <td lang="he" dir="rtl">אוּרִיאֵל</td>
    <td>m</td>
    <td>West Asia; Hebrew</td>
    <td>he
<tr>
    <td>Vasa</td>
    <td></td>
    <td>m</td>
    <td>Oceania; Samoa; Europe; diminutive form of Vasilije/<span lang="sr-Cyrl">Василије</span></td>
    <td>sm, hr, sr
<tr>
    <td>Vassilios</td>
    <td lang="el">Βασίλειος</td>
    <td>m</td>
    <td>Europe; Greek</td>
    <td>el
<tr>
    <td>Voula</td>
    <td lang="el">Βούλα</td>
    <td>f</td>
    <td>Europe; Greek</td>
    <td>el
<tr>
    <td>Wafaa</td>
    <td lang="ar" dir=rtl>وفاء</td>
    <td>f</td>
    <td>West Asia; Arabic</td>
    <td>ar
<tr>
    <td>Wissam</td>
    <td lang="ar" dir=rtl>وسام</td>
    <td>m</td>
    <td>West Asia; Arabic</td>
    <td>ar
<tr>
    <td>Xiaoxia</td>
    <td lang="zh-Hans">晓霞</td>
    <td>f</td>
    <td>East Asia; China</td>
    <td>zh-Hans
<tr>
    <td>Xóchitl</td>
    <td></td>
    <td>f</td>
    <td>Latin America; Nahuatl</td>
    <td>nah
<tr>
    <td>Yevdokia</td>
    <td lang="ru">Евдокия</td>
    <td>f</td>
    <td>Europe; Russia</td>
    <td>ru
<tr>
    <td>Yevgeny</td>
    <td lang="ru">Евгений</td>
    <td>m</td>
    <td>Europe; Russia</td>
    <td>ru
<tr>
    <td>Zafirah</td>
    <td lang="ar" dir="rtl">زفره</td>
    <td>f</td>
    <td>West Asia; Arabic</td>
    <td>ar</td>
</tr>
</tbody>
</table>

<aside class="note">
    <p>Another potential source for example names is Unicode's <cite>Common Locale Data Repository</cite> [[CLDR]] project, which maintains data about the presentation of personal names in different [= locales =] and cultures. That set of names can be accessed on CLDR's <a href="https://www.unicode.org/cldr/charts/latest/by_type/miscellaneous.person_name_formats.html#SampleName_Fields_for_Item:_nativeG">charts page</a>. Note that those names were collected with a focus on providing examples of the <em>formatting</em> of personal names, rather than as examples of typical names found in these cultures. For example, many of the names are variations on the name "Sinbad", even though "Sinbad" is not a commonly used name in many of these cultures. Other names are taken from sources, such as folk tales, that would be familiar in a given culture but not representative of actual persons.</p>
</aside>

<!-- start CLDR data -->
<!--  commented out as a WG decision in teleconference of 2022-12-15
      also removed this from the body onload: sortTable('cldrExampleNamesTable', 0, false);
<h5>CLDR sample names</h5>

<p>The following list of names is taken from Unicode's <cite>Common Locale Data Repository</cite> [[CLDR]] project, which maintains data about the presentation of personal names in different [= locales =] and cultures. In addition to the table below, this set of names can be accessed on CLDR's <a href="https://unicode-org.github.io/cldr-staging/charts/latest/by_type/miscellaneous.person_name_formats.html#SampleName_Fields_for_Item:_givenOnly">charts page</a>. Note that these names were collected with a focus on providing examples of the <em>formatting</em> of personal names, rather than as examples of typical names found in these cultures. For example, many of the names are variations on the name "Sinbad", even though "Sinbad" is not a commonly used name in many of these cultures. Other names are taken from sources, such as folk tales, that would be familiar in a given culture but not representative of actual persons.</p>

<p><em>This table can be sorted by clicking on the &#x25b3; or &#x25bd; arrows in the header row.</em></p>
<table id="cldrExampleNamesTable" class="exampleNamesTable">
	<thead>
		<tr>
			<th>Name <span onclick="sortTable('cldrExampleNamesTable', 0, false)">&#x25b3;</span><span onclick="sortTable('cldrExampleNamesTable', 0, true)">&#x25bd;</span></th>
			<th>Native <span onclick="sortTable('cldrExampleNamesTable', 1, false)">&#x25b3;</span><span onclick="sortTable('cldrExampleNamesTable', 1, true)">&#x25bd;</span></th>
			<th>Gender <span onclick="sortTable('cldrExampleNamesTable', 2, false)">&#x25b3;</span><span onclick="sortTable('cldrExampleNamesTable', 2, true)">&#x25bd;</span></th>
			<th>Region and Notes <span onclick="sortTable('cldrExampleNamesTable', 3, false)">&#x25b3;</span><span onclick="sortTable('cldrExampleNamesTable', 3, true)">&#x25bd;</span></th>
			<th>Language <span onclick="sortTable('cldrExampleNamesTable', 4, false)">&#x25b3;</span><span onclick="sortTable('cldrExampleNamesTable', 4, true)">&#x25bd;</span></th>
		</tr>
	</thead>
	<tbody>
 <tr>
  <td>Anh</td>
  <td></td>
  <td>f</td>
  <td>Southeast Asia; Vietnamese</td>
  <td>vi</td>
 </tr>
 <tr>
  <td>Bjørn</td>
  <td></td>
  <td>m</td>
  <td>Europe; Danish</td>
  <td>da</td>
 </tr>
 <tr>
  <td>Blerim</td>
  <td></td>
  <td></td>
  <td>Europe; Albanian</td>
  <td>sq</td>
 </tr>
 <tr>
  <td>Calum</td>
  <td></td>
  <td>m</td>
  <td>Europe; Gaelic</td>
  <td>gd</td>
 </tr>
 <tr>
  <td>Diego</td>
  <td></td>
  <td>m</td>
  <td>Latin American Spanish</td>
  <td>es-419</td>
 </tr>
 <tr>
  <td>F&#259;t-Frumos</td>
  <td></td>
  <td>m</td>
  <td>Europe; Romania</td>
  <td>ro</td>
 </tr>
 <tr>
  <td>Frantziscu</td>
  <td></td>
  <td>m</td>
  <td>Europe; Sardinian</td>
  <td>sc</td>
 </tr>
 <tr>
  <td>Gurban</td>
  <td></td>
  <td>m</td>
  <td>Central Asia; Turkmen</td>
  <td>tk</td>
 </tr>
 <tr>
  <td>Harald</td>
  <td></td>
  <td>m</td>
  <td>Europe; Norwegian</td>
  <td>no</td>
 </tr>
 <tr>
  <td>Hassan</td>
  <td></td>
  <td>m</td>
  <td>Africa; Swahili</td>
  <td>sw</td>
 </tr>
 <tr>
  <td>Ivan</td>
  <td></td>
  <td>m</td>
  <td>Europe; Serbian (Latin)</td>
  <td>sr-Latn</td>
 </tr>
 <tr>
  <td>J&#257;nis</td>
  <td></td>
  <td>f, m</td>
  <td>Europe; Latvian</td>
  <td>lv</td>
 </tr>
 <tr>
  <td>Jaume</td>
  <td></td>
  <td>m</td>
  <td>Europe; Catalan</td>
  <td>ca</td>
 </tr>
 <tr>
  <td>Ji&#345;í</td>
  <td></td>
  <td>m</td>
  <td>Europe; Czech</td>
  <td>cs</td>
 </tr>
 <tr>
  <td>Jozef</td>
  <td></td>
  <td>m</td>
  <td>Europe; Slovak</td>
  <td>sk</td>
 </tr>
 <tr>
  <td>Jurij</td>
  <td></td>
  <td>m</td>
  <td>Europe; Upper Sorbian</td>
  <td>hsb</td>
 </tr>
 <tr>
  <td>Juro</td>
  <td></td>
  <td>m</td>
  <td>Europe; Lower Sorbian</td>
  <td>dsb</td>
 </tr>
 <tr>
  <td>Lena</td>
  <td></td>
  <td>f</td>
  <td>Europe; German</td>
  <td>de</td>
 </tr>
 <tr>
  <td>Liam</td>
  <td></td>
  <td>m</td>
  <td>Europe; Dutch (Flemish)</td>
  <td>nl-BE</td>
 </tr>
 <tr>
  <td>Lola</td>
  <td></td>
  <td>f</td>
  <td>Europe; Spanish</td>
  <td>es</td>
 </tr>
 <tr>
  <td>Louhi</td>
  <td></td>
  <td>f</td>
  <td>Europe; Finnish</td>
  <td>fi</td>
 </tr>
 <tr>
  <td>Marcel</td>
  <td></td>
  <td>m</td>
  <td>Europe; French</td>
  <td>fr</td>
 </tr>
 <tr>
  <td>Mari</td>
  <td></td>
  <td>f</td>
  <td>Europe; Estonian</td>
  <td>et</td>
 </tr>
 <tr>
  <td>Maria</td>
  <td></td>
  <td>f</td>
  <td>Europe; Portuguese</td>
  <td>pt</td>
 </tr>
 <tr>
  <td>Mario</td>
  <td></td>
  <td>m</td>
  <td>Italian, Quechua</td>
  <td>it, qu</td>
 </tr>
 <tr>
  <td>Piet</td>
  <td></td>
  <td>m</td>
  <td>Africa; Afrikaans</td>
  <td>af</td>
 </tr>
 <tr>
  <td>Rokas</td>
  <td></td>
  <td>m</td>
  <td>Europe; Lithuanian</td>
  <td>lt</td>
 </tr>
 <tr>
  <td>Sendoa</td>
  <td></td>
  <td>m</td>
  <td>Europe; Basque</td>
  <td>eu</td>
 </tr>
 <tr>
  <td>sibadì</td>
  <td></td>
  <td>m</td>
  <td>Africa; Yoruba</td>
  <td>yo, yo-BJ</td>
 </tr>
 <tr>
  <td>Sigurður</td>
  <td></td>
  <td>m</td>
  <td>Europe; Icelandic</td>
  <td>is</td>
 </tr>
 <tr>
  <td>sinbad</td>
  <td></td>
  <td>m</td>
  <td>Africa; Igbo</td>
  <td>ig</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td></td>
  <td>m</td>
  <td>Europe, Central Asia</td>
  <td>az, bs, cy, en, fil, ga, ha, ha-NE, id, jv, nl, pcm, pl, so, tr, zu</td>
 </tr>
 <tr>
  <td>Sinbod</td>
  <td></td>
  <td>m</td>
  <td>Central Asia; Uzbek</td>
  <td>uz</td>
 </tr>
 <tr>
  <td>Slavko</td>
  <td></td>
  <td>m</td>
  <td>Europe; Croatian</td>
  <td>hr</td>
 </tr>
 <tr>
  <td>Stefan</td>
  <td></td>
  <td>m</td>
  <td>Europe; Swedish</td>
  <td>sv</td>
 </tr>
 <tr>
  <td>Sulaiman</td>
  <td></td>
  <td>m</td>
  <td>Southeast Asia; Malay</td>
  <td>ms</td>
 </tr>
 <tr>
  <td>Svarun</td>
  <td></td>
  <td></td>
  <td>Europe; Slovenian</td>
  <td>sl</td>
 </tr>
 <tr>
  <td>T&#275;vita</td>
  <td></td>
  <td></td>
  <td>Oceania; Tongan</td>
  <td>to</td>
 </tr>
 <tr>
  <td>Uxía</td>
  <td></td>
  <td>f</td>
  <td>Europe; Galician</td>
  <td>gl</td>
 </tr>
 <tr>
  <td>Vuk</td>
  <td></td>
  <td>m</td>
  <td>Europe; Hungarian</td>
  <td>hu</td>
 </tr>
 <tr>
  <td>Yann</td>
  <td></td>
  <td>m</td>
  <td>Europe; Breton</td>
  <td>br</td>
 </tr>
 <tr>
  <td>Dimitris</td>
  <td>&#916;&#951;&#956;&#942;&#964;&#961;&#951;&#962;</td>
  <td></td>
  <td>Europe; Greek</td>
  <td>el</td>
 </tr>
 <tr>
  <td>Vasíl&#697;</td>
  <td>&#1042;&#1072;&#1089;&#1110;&#1083;&#1100;</td>
  <td>m</td>
  <td>Europe; Belarussian</td>
  <td>be</td>
 </tr>
 <tr>
  <td>Ivan</td>
  <td>&#1048;&#1074;&#1072;&#1085;</td>
  <td>m</td>
  <td>Europe, Bulgaria; Serbian</td>
  <td>bg, sr</td>
 </tr>
 <tr>
  <td>Krste</td>
  <td>&#1050;&#1088;&#1089;&#1090;&#1077;</td>
  <td>m</td>
  <td>Europe; Macedonian</td>
  <td>mk</td>
 </tr>
 <tr>
  <td>N&#1201;rlan</td>
  <td>&#1053;&#1201;&#1088;&#1083;&#1072;&#1085;</td>
  <td></td>
  <td>Central Asia, Kazakhstan</td>
  <td>kk</td>
 </tr>
 <tr>
  <td>Sergey</td>
  <td>&#1057;&#1077;&#1088;&#1075;&#1077;&#1081;</td>
  <td>m</td>
  <td>Europe; Russian</td>
  <td>ru</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#1057;&#1080;&#1085;&#1073;&#1072;&#1076;</td>
  <td>m</td>
  <td>East Asia; Mongolian</td>
  <td>mn</td>
 </tr>
 <tr>
  <td>Syjmyk, Symyk</td>
  <td>&#1057;&#1099;&#1081;&#1084;&#1099;&#1082;</td>
  <td>m</td>
  <td>Central Asia; Kyrgyz</td>
  <td>ky</td>
 </tr>
 <tr>
  <td>Yuriy, Ûríj</td>
  <td>&#1070;&#1088;&#1110;&#1081;</td>
  <td>m</td>
  <td>Europe; Ukrainian</td>
  <td>uk</td>
 </tr>
 <tr>
  <td>Arthur</td>
  <td>&#1329;&#1408;&#1385;&#1400;&#1410;&#1408;</td>
  <td>m</td>
  <td>Europe; Armenian</td>
  <td>hy</td>
 </tr>
 <tr>
  <td>Yonatan (Jonathan)</td>
  <td dir=rtl>&#1497;&#1493;&#1504;&#1514;&#1503;</td>
  <td>m</td>
  <td>West Asia; Hebrew</td>
  <td>he</td>
 </tr>
 <tr>
  <td>Ahmed</td>
  <td dir=rtl>&#1575;&#1581;&#1605;&#1583;</td>
  <td>m</td>
  <td>South Asia; Urdu</td>
  <td>ur</td>
 </tr>
 <tr>
  <td>Sindbad</td>
  <td dir=rtl>&#1587;&#1606;&#1576;&#1575;&#1583;</td>
  <td>m</td>
  <td>South Asia; Pashto, Sindhi</td>
  <td>ps, sd</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td dir=rtl>&#1587;&#1606;&#1583;&#1576;&#1575;&#1583;</td>
  <td>m</td>
  <td>West Asia; Persian</td>
  <td>fa</td>
 </tr>
 <tr>
  <td>Munir</td>
  <td dir=rtl>&#1605;&#1606;&#1610;&#1585;</td>
  <td>m</td>
  <td>West Asia; Arabic</td>
  <td>ar</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#4658;&#4757;&#4707;&#4853;</td>
  <td>m</td>
  <td>Africa; Ethiopia</td>
  <td>am</td>
 </tr>
 <tr>
  <td>Raj</td>
  <td><span title=r&#257;ja>&#2352;&#2366;&#2332;</span></td>
  <td>m</td>
  <td>South Asia; Konkani</td>
  <td>kok</td>
 </tr>
 <tr>
  <td>Lalita</td>
  <td>&#2354;&#2354;&#2367;&#2340;</td>
  <td>f</td>
  <td>South Asia, Hindi</td>
  <td>hi</td>
 </tr>
 <tr>
  <td>Sindab&#257;da</td>
  <td>&#2360;&#2367;&#2306;&#2342;&#2348;&#2366;&#2342;</td>
  <td>m</td>
  <td>South Asia, Marathi</td>
  <td>mr</td>
 </tr>
 <tr>
  <td>Sundara</td>
  <td>&#2360;&#2369;&#2344;&#2381;&#2342;&#2352;</td>
  <td>f</td>
  <td>South Asia, Nepali</td>
  <td>ne</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#2458;&#2495;&#2472;&#2509;&#2470;&#2476;&#2494;&#2470;</td>
  <td>m</td>
  <td>South Asia, Assamese</td>
  <td>as</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#2488;&#2495;&#2472;&#2476;&#2494;&#2470;</td>
  <td>m</td>
  <td>South Asia, Bengali</td>
  <td>bn</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#2616;&#2623;&#2600;&#2613;&#2622;&#2598;</td>
  <td>m</td>
  <td>South Asia, Punjabi</td>
  <td>pa</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#2744;&#2751;&#2728;&#2732;&#2750;&#2726;</td>
  <td>m</td>
  <td>South Asia, Gujarati</td>
  <td>gu</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#2872;&#2879;&#2856;&#2860;&#2878;&#2854;&#2893;</td>
  <td>m</td>
  <td>South Asia; Odia</td>
  <td>or</td>
 </tr>
 <tr>
  <td>Rajendran</td>
  <td>&#2992;&#3006;&#2972;&#3015;&#2984;&#3021;&#2980;&#3007;&#2992;&#2985;&#3021;</td>
  <td>m</td>
  <td>South Asia, Tamil</td>
  <td>ta</td>
 </tr>
 <tr>
  <td>Rajasekhar</td>
  <td>&#3120;&#3134;&#3100;&#3126;&#3143;&#3094;&#3120;&#3149;</td>
  <td>m</td>
  <td>South Asia; Telugu</td>
  <td>te</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#3256;&#3263;&#3240;&#3277;&#8204;&#3244;&#3262;&#3238;&#3277;</td>
  <td>m</td>
  <td>South Asia, Kannada</td>
  <td>kn</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#3384;&#3391;&#3451;&#3372;&#3390;&#3361;&#3405;</td>
  <td>m</td>
  <td>South Asia; Malayalam</td>
  <td>ml</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#3523;&#3538;&#3505;&#3530;&#3510;&#3537;&#3497;&#3530;</td>
  <td>m</td>
  <td>South Asia; Sinhala</td>
  <td>si</td>
 </tr>
 <tr>
  <td>Thana (?)</td>
  <td>&#3608;&#3609;&#3634;</td>
  <td></td>
  <td>Southeast Asia; Thai</td>
  <td>th</td>
 </tr>
 <tr>
  <td>Sinbad</td>
  <td>&#3722;&#3764;&#3737;&#3777;&#3738;&#3732;</td>
  <td>m</td>
  <td>Southeast Asia; Lao</td>
  <td>lo</td>
 </tr>
 <tr>
  <td>Sokha</td>
  <td>&#6047;&#6075;&#6017;&#6070;</td>
  <td></td>
  <td>Southeast Asia; Khmer</td>
  <td>km</td>
 </tr>
 <tr>
  <td>Yumi</td>
  <td>&#50976;&#48120;</td>
  <td></td>
  <td>East Asia; Korean</td>
  <td>ko</td>
 </tr>
 <-- Dai Bun
 <tr>
  <td>dà wén</td>
  <td>&#22823;&#25991;</td>
  <td></td>
  <td>East Asia</td>
  <td>yue-Hans</td>
 </tr>
 <-- Unihan?
 <tr>
  <td>y&#468; hàn</td>
  <td>&#23431;&#28698;</td>
  <td></td>
  <td>East Asia</td>
  <td>zh</td>
 </tr>

 <tr>
  <td>Shintaro</td>
  <td>&#24910;&#22826;&#37070;</td>
  <td>m</td>
  <td>East Asia; Japanese</td>
  <td>ja</td>
 </tr>
 <tr>
  <td>Wén Jié</td>
  <td>&#25991;&#20625;</td>
  <td></td>
  <td>East Asia; Chinese, Yue</td>
  <td>yue, zh-Hant</td>
 </tr>
</tbody>
</table>

<hr>

-->

</section>
</section>


<section id="loc_numbers" class="subtopic">
<h3>Working with numbers</h3>


	<div class="req" id="loc_numbers_shape_parse">
	<p class="advisement">When parsing user input of numeric values, allow for digit shaping (non-ASCII digits).</p>
	</div>

	<div class="req" id="loc_numbers_shape_display">
	<p class="advisement">When formatting numeric values for display, allow for culturally sensitive display, including the use of non-ASCII digits (digit shaping).</p>
	</div>

	<div class="req" id="loc_numbers_lists">
	<p class="advisement">When defining a feature that automatically labels items incrementally for display to the user (such as when creating a numbered list), allow for localized presentation of the labels as well as for various counting/listing systems or styles.</p>
	<p>Examples of this can be found in <cite>CSS Counter Styles</cite> [[css-counter-styles-3]] and especially the accompanying <cite>Ready-made Counter Styles</cite> [[predefined-counter-styles]].</p>
	<details class="links"><summary>more</summary>
		<p>Defining counter styles in CSS:</p>
	<ul>
		<li><a href="https://www.w3.org/TR/css-counter-styles/">CSS Counter Styles</a>.</li>
	    <li><a href="https://www.w3.org/TR/predefined-counter-styles/">Ready-made Counter Styles</a>.</li>
	</ul>
	</details>
	</div>
</section>


<section id="loc_forms" class="subtopic">
<h3>Designing forms</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t:loc_forms" target="_blank">See related review comments.</a></p>

<div class="req" id="loc_forms_eai">
	<p class="advisement">When defining email field validation, allow for EAI (smtputf8) names.</p>
</div>
</section>


<section id="loc_input" class="subtopic">
<h3><em>User input (TBD)</em></h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Aloc_input_x" target="_blank">See related review comments.</a></p>
</section>


<section id="loc_examples" class="subtopic">
<h3><em>Creating examples (TBD)</em></h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Aloc_examples_x" target="_blank">See related review comments.</a></p>
</section>


<section id="loc_localization" class="subtopic">
<h3>Localization</h3>


<p><a href="https://www.w3.org/TR/ltli/#dfn-localization">Localization</a> [[LTLI]] enables users to employ software in the language and locale of their choice. Specifications for protocols and document formats need to consider how to provide the language and formatting that the end-user expects.</p>

<p>Natural language data values need language and base direction in order to ensure proper presentation, even if localized messages are not provided. This includes any error messages or other internal messages that are human readable in an API or protocol. See also [[STRING-META]].</p>

	<div class="req" id="l10n_api_message_metadata">
	<p class="advisement">APIs and protocols SHOULD include language and string direction metadata for all <a>natural language</a> messages and data fields.</p>
	</div>

	<div class="req" id="l10n_api_message_language">
	<p class="advisement">All <a>natural language</a> fields or messages, including error messages, defined by a given API or protocol SHOULD be localized into the preferred locale of the user or, if that language is not available, supplied with a suitable fallback or default.</p>
	</div>

	<div class="req" id="l10n_api_lang_nego">
	<p class="advisement">Specifications for APIs or protocols SHOULD define how the user's locale is determined (this is sometimes called <a href="#lang_negotiation">language negotiation</a>).</p>
	</div>
	<div class="req" id="l10n_api_lang_defaulting">
	<p class="advisement">Specifications MAY define a specific default language for messages or errors in an API or protocol.</p>
	</div>

<p class="note">Specifications do not need to require that messages be returned in all possible or all available locales. It is sufficient to make it possible to localize the end-user's customer experience. Implementations can choose which languages or locales to support.</p>


<section id="errors">
<h4>Working with error and exception messages</h4>

<p>Protocols, APIs, and document formats sometimes provide a field to pass a human-readable error or exception message from a service to the caller in the form of a string. In general, and as <a href="#l10n_api_message_metadata">indicated above</a>, any natural language text conveying human-readable messages or human-readable content needs to be associated with language and direction metadata. Where this metadata is missing, the processing or display of the text might be compromised.</p>

<p>Often the intention of the specification author in providing an error or exception message is to convey debugging information to a software developer. Specification authors sometimes assume that error or exception messages are not seen by end users; that software developers will prefer these messages to be unlocalized or appear in a specific language (usually English); or that there are other "practical reasons" why localization of error messages can turn out to be a barrier. For example, there are anecdotes about developers finding it easier to search the Web with the (usually obscure) text of an error because the message itself is insufficiently good at explaining the problem. Searching for this text might produce a result in the developer's preferred language that is more helpful.</p>

<p>Error messages are messages and they are intended for humans, not machines. In many cases, the error message encompasses all of the additional information about what went wrong and, in some cases, the caller is obliged to show the message to the actual end user because there is no other way to convey how to fix the problem ("Your credit card has expired"; "The value 10484977 is too large" [oops, forgot the decimal point]; etc.). Localization of these types of messages is actually a good thing and may even be obligatory in some applications.</p>

	<div class="req" id="l10n_api_message_id">
	<p class="advisement">APIs and protocols SHOULD provide language independent identifiers for errors.</p>
	</div>

<p>For example, HTTP result codes, such as the familiar <code>404</code>, help users communicate which error they received or look up a translation.</p>

    <div class="req" id="l10n_api_error_message_optional">
    <p class="advisement">Natural language error message fields, when provided, SHOULD be optional and SHOULD include language and direction metadata.</p>
    </div>

    <div class="req" id="l10n_api_error_message_language">
    <p class="advisement">Natural language error message fields, when provided, SHOULD match the user interface language negotiated for the request when possible.</p>
    </div>

</section>
</section>
</section>


<section id="navigation" class="topic">
<h2>Navigation</h2>

<div id="navigation_checklist" class="summaryC"></div>


<section id="lang_negotiation" class="subtopic">
<h3>Providing for content negotiation based on language</h3>

<p class="reviewComments"><a href="https://github.com/w3c/i18n-activity/labels/t%3Alang_negotiation" target="_blank">See related review comments.</a></p>

	<div class="req" id="lang_neg">
	<p class="advisement">In a multilingual environment it must be possible for the user to receive text in the language they prefer. This may depend on implicit user preferences based on the user's system or browser setup, or on user settings explicitly negotiated with the user.</p>
	</div>
</section>
</section>


<section class="appendix" id="app-c">
<h2>Revision Log</h2>
  <p>The following summarises substantive changes since the previous publication, but the material is still subject to significant flux as it develops. This should not be a reason not to use the document. What it so far contains is useful, and any shortfalls can be reported or discussed.</p>
  <ol>
    <li>Links were added below section headings that point to lists of review comments related to that section. These comments provide details useful to developers and reviewers.</li>
<li>The checklist generator tool was moved to the top of the document.</li>
    <li>The table of contents now reports 3 levels of heading.</li>
<li>Lists of links to documents that provide useful background or an overview of a section have been moved to the start of that section. So also have 'see also' links.</li>
<li>Each advisement now carries its own set of links. This makes the links more relevant and more easily noticeable. It also makes it easier to list multiple links, and because the links indicate the target document title, readers do not have to follow the link to know whether they have already read the document pointed to.</li>
<li>Links associated with an advisement are of two types: 'explanations &amp; examples' typically points to a location in another document from which this advisement was lifted, and surrounds it there with explanatory text; 'more' links provide further reading in other documents.</li>
<li>Self-links for each advisement have been changed so that they match the standard style used for headings (§ to the side of the text). This also significantly reduces the complexity of authoring the markup.</li>
<li>Added content in the <a href="#locale">section on locales</a>, along with text about <a href="#file_naming">working with file and path names</a> and <a href="#errors">working with error messages</a>.</li>
<li>Generally, the markup in the document source has been greatly simplified, making it easier and quicker to maintain the document.</li>
  </ol>
    <p>See the <a href="https://github.com/w3c/bp-i18n-specdev/commits/gh-pages">github commit log</a> for more details.</p>
</section>


<section class="appendix" id="ack">
<h2>Acknowledgements</h2>
<p>Thanks to Addison Phillips for help reviewing old reviews for recommendations.</p>
<p>Other people who contributed through reviews or issues include Steve Atkin, Andrew Cunningham, Martin Dürst, Asmus Freytag, John Klensin, Tomer Mahlin, Chaals McCathieNevile, and Florian Rivoal. Some material about locale-neutral representation was adapted from [[DWBP]].</p>
</section>


<!--button onClick="document.getElementById('dump').value = dumpData(); document.getElementById('dump').select();">Dump data</button>
<textarea id="dump" style="width: 100%; height: 400px;"></textarea-->


<script class="remove">

// Post-load setup: add self-links to requirements and build the per-section
// checklists. All variables are block-scoped, so this script adds nothing to
// the global object.

// Prepend a '§' self-link to every .req element that carries an id.
// Elements without an id are skipped: there is no fragment to link to.
for (const req of document.querySelectorAll('.req')) {
	if (!req.id) continue
	const selfLink = document.createElement('a')
	selfLink.href = '#' + req.id
	selfLink.textContent = '§'
	selfLink.className = 'self-link'
	req.prepend(selfLink)
}

// establish the lists at section start with checklist details:
// each .summaryC placeholder is passed to showChecklists() together with its
// parent node and its own id.
// NOTE(review): showChecklists is not defined in this script; presumably it
// comes from make_checklist.js, which is loaded in the document head.
for (const placeholder of document.querySelectorAll('.summaryC')) {
	showChecklists(placeholder.parentNode, placeholder.id)
}
</script>


</body>
</html>