Commit dafec10 - 41 changed files with 3,324 additions and 0 deletions.
```yaml
name: Deploy Zola to GitHub Pages

on:
  push:
    branches: [master]
  workflow_dispatch:

permissions:
  contents: read
  pages: write
  id-token: write

concurrency:
  group: "pages"
  cancel-in-progress: false

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - name: Install Zola
        uses: taiki-e/install-action@v2
        with: { tool: zola@0.19.1 }

      - name: Checkout
        uses: actions/checkout@v4

      - name: Install NPM packages
        uses: actions/setup-node@v4
        with: { node-version: 21, cache: "npm" }

      - name: Build site
        run: |
          npm install
          npm run build:css
          zola build
          touch public/.nojekyll

      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: ./public

  publish:
    runs-on: ubuntu-latest
    needs: build
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
```
```
themes
build
storage
node_modules
public
```
# Thiago's Guide To Everything

Hello! I'm Thiago, and this is a place where I share some of my discoveries with the world. The main purpose of this page is to serve as an index for my own findings and for useful things I learn and usually forget after a while - a "memory vault", if you will. Feel free to snoop around.
```js
export default {
  gray: {
    50: "#f2f2f4",
    100: "#d9d9dc",
    200: "#bfbfc4",
    300: "#a5a5ac",
    400: "#8b8b93",
    500: "#71717b",
    600: "#585862",
    700: "#3e3e4a",
    800: "#242431",
    900: "#191921",
    950: "#0E0D11",
  },
  blue: {
    50: "#f0f2f8",
    100: "#d7dcec",
    200: "#bec6df",
    300: "#a6b0d1",
    400: "#8f9bc3",
    500: "#7886b4",
    600: "#6272a4",
    700: "#54628c",
    800: "#485273",
    900: "#3b425a",
    950: "#32374c",
  },
  cyan: {
    50: "#fafeff",
    100: "#d4f8ff",
    200: "#aff0fe",
    300: "#8be9fd",
    400: "#67e1fb",
    500: "#44d9f8",
    600: "#22d0f5",
    700: "#0dbfe5",
    800: "#0ca0bf",
    900: "#0c819a",
    950: "#0a6c80",
  },
  green: {
    50: "#e2ffe9",
    100: "#bcffcd",
    200: "#97feb1",
    300: "#73fc96",
    400: "#50fa7b",
    500: "#2df760",
    600: "#0cf346",
    700: "#0ccd3d",
    800: "#0ba833",
    900: "#0a8329",
    950: "#086d22",
  },
  orange: {
    50: "#ffefdd",
    100: "#ffdcb7",
    200: "#ffca92",
    300: "#ffb86c",
    400: "#fda648",
    500: "#fb9325",
    600: "#f38107",
    700: "#cd6d08",
    800: "#a75a08",
    900: "#824707",
    950: "#6c3906",
  },
  pink: {
    50: "#ffeaf6",
    100: "#ffc4e6",
    200: "#ff9fd6",
    300: "#ff79c6",
    400: "#fd55b6",
    500: "#fb31a5",
    600: "#f80e95",
    700: "#d90880",
    800: "#b3086b",
    900: "#8e0855",
    950: "#770745",
  },
  purple: {
    50: "#fefeff",
    100: "#e9dafe",
    200: "#d3b6fc",
    300: "#bd93f9",
    400: "#a771f6",
    500: "#924ff2",
    600: "#7d2eed",
    700: "#6916e0",
    800: "#5914bb",
    900: "#491298",
    950: "#3e107f",
  },
  red: {
    50: "#ffecec",
    100: "#ffc6c6",
    200: "#ffa0a0",
    300: "#ff7b7b",
    400: "#ff5555",
    500: "#fd3131",
    600: "#fb0e0e",
    700: "#dd0606",
    800: "#b70707",
    900: "#910707",
    950: "#7a0606",
  },
  yellow: {
    50: "#fefff8",
    100: "#fafed3",
    200: "#f6fcaf",
    300: "#f1fa8c",
    400: "#ebf769",
    500: "#e5f347",
    600: "#dfef26",
    700: "#ccdd12",
    800: "#abb811",
    900: "#899410",
    950: "#737b0e",
  },
};
```
```toml
base_url = "https://thiagomajesk.github.io"
compile_sass = false
build_search_index = false
generate_feeds = true
feed_filenames = ["rss.xml", "atom.xml"]

taxonomies = [
  { name = "categories", feed = true, paginate_by = 10 },
  { name = "tags", feed = true, paginate_by = 10 },
]

[markdown]
# Whether to do syntax highlighting
# Theme can be customised by setting the `highlight_theme` variable to a theme supported by Zola
highlight_code = true

[extra]
title = "Thiago's Guide To Everything"
keywords = ["zola", "blog"]
```
```
---
sort_by: date
paginate_by: 5
---
```
```
---
template: archive.html
---
```
content/posts/2022-07-14-recovering-from-aborted-transactions-with-ecto.md (66 additions)
---
title: Recovering from aborted transactions with Ecto
date: 2022-07-14
taxonomies:
  categories: [development]
  tags: [elixir, ecto, sql]
---

It's not uncommon to need a series of database operations where one depends on the result of another. For those cases, we usually resort to transactions, which give us more control over how we persist our data in a single, consistent unit of work.

<!-- more -->
So, imagine we have the following scenario: we have a blog engine that diligently keeps track of all created posts. Each time we attempt to create a new Post, a Log with the status of the operation must also be created. Since this logging mechanism is so important to our blog engine, if we can't create both the Post and the Log, we discard the whole operation.
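For reference, here's a minimal sketch of what the `Post` and `Log` schemas could look like; the module, table, and field names beyond `title` and `status` are assumptions for illustration, not taken from the original code:

```elixir
defmodule Blog.Post do
  use Ecto.Schema

  schema "posts" do
    # The title carries a unique constraint at the database level
    field :title, :string
    timestamps()
  end
end

defmodule Blog.Log do
  use Ecto.Schema

  schema "logs" do
    # The examples below only ever set :success or :failure
    field :status, Ecto.Enum, values: [:success, :failure]
    timestamps()
  end
end
```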

In Ecto, we have two ways of working with transactions: using the [`Ecto.Repo.transaction/2`](https://hexdocs.pm/ecto/3.8.4/Ecto.Repo.html#c:transaction/2) function directly, or using the [`Ecto.Multi`](https://hexdocs.pm/ecto/3.8.4/Ecto.Multi.html) module. Since we don't want to keep track of all the intermediate states of the operation, Multi is a little overkill, so we'll go with the much simpler transaction function:

```elixir
changeset =
  %Post{title: "Recovering from aborted transactions with Ecto"}
  |> Ecto.Changeset.change()
  |> Ecto.Changeset.unique_constraint([:title])

Repo.transaction(fn repo ->
  case repo.insert(changeset) do
    {:ok, _post} -> repo.insert!(%Log{status: :success})
    {:error, _changeset} -> repo.insert!(%Log{status: :failure})
  end
end)
```

Notice two things in the example above. The first is that we are using `repo.insert/2`, which does not raise if the Post changeset is invalid. This is important because we want to deal with the result of the operation differently in each case. The second is that we use `repo.insert!/2` to create the Log, because if that operation fails, we want to raise and roll back the whole transaction.

> Here I have to stop and make an important remark about Ecto: since Ecto is not your traditional ORM, it does not do much to help you when things go south. This might sound horrible at first, but after using it for a while you wind up realizing that it's actually a superior model. Since we are closer to the database and don't have to deal with layers upon layers of abstraction, problems are also simpler to identify and understand.

Getting back to our previous example... What happens if we already have a Post with the same title in the database? If you haven't experienced this scenario before, you might think that since we are checking for a unique constraint, the changeset becomes invalid and the result of the insert will match on the `:error` tuple. However, this is not what happens, and here's why: when we use [`unique_constraint/3`](https://hexdocs.pm/ecto/3.8.4/Ecto.Changeset.html#unique_constraint/3) in our changesets, it actually relies on the database to check whether the constraint has been violated. This means that the changeset can only become invalid **after** the database has already executed the operation. Given that the database attempted an invalid insert inside a transaction, it automatically aborts that transaction, and because the transaction is aborted, by the time our code reaches the next `repo.insert!`, it throws this beautiful exception:

```
** (Postgrex.Error) ERROR 25P02 (in_failed_sql_transaction) current transaction is aborted, commands ignored until end of transaction block
```

If, like me, you already spent a disproportionate amount of time trying to understand why this happens in the first place, I have some good news: there are at least two ways we can solve this problem.

The first solution, and probably the most obvious one, is to prevent the database from raising when it checks the constraint. We can achieve this by setting the `:on_conflict` option to `:nothing`. However, bear in mind that this solution has a catch: since we are instructing the database to do nothing when it identifies a conflict, our Post still doesn't get inserted, yet the result will match on `{:ok, post}`. The full solution could look like this:

```elixir
Repo.transaction(fn repo ->
  case repo.insert(changeset, on_conflict: :nothing) do
    # The returned struct has no id when nothing was actually inserted
    {:ok, %{id: nil}} -> :noop # do something here to deal with the edge case
    {:ok, _post} -> repo.insert!(%Log{status: :success})
    {:error, _changeset} -> repo.insert!(%Log{status: :failure})
  end
end)
```

The second solution, and also my favorite, uses a SQL feature called a [`SAVEPOINT`](https://en.wikipedia.org/wiki/Savepoint). In short, savepoints prevent a database error from failing the whole transaction by restoring it to a previous state. Here's how to use it:

```elixir
Repo.transaction(fn repo ->
  case repo.insert(changeset, mode: :savepoint) do
    {:ok, _post} -> repo.insert!(%Log{status: :success})
    {:error, _changeset} -> repo.insert!(%Log{status: :failure})
  end
end)
```

This simple option makes it possible to retain the original semantics of our code and, without much effort, get rid of that pesky exception - YAY!
content/posts/2022-10-03-storing-unicode-strings-in-the-database.md (115 additions)
---
title: Storing unicode strings in the database
date: 2022-10-03
taxonomies:
  categories: [development]
  tags: [elixir, postgres, sql]
---

Recently I had to deal with a really pesky problem, related to how unicode strings work, during the development of a client's product. While building a new feature, we needed to summarize some long-form text collected from the database. Because this text had been migrated over by a third-party company, we had no knowledge of how it was previously stored and handled during the migration.

<!-- more -->

However, we knew that the original text could contain HTML markup from a rich text WYSIWYG editor. So this was our first (naive) try:

```elixir
short_description =
  description
  |> Floki.parse_document!()
  |> Floki.text()
  |> String.slice(0..254)
```

First we parse the HTML fragment and retrieve the text from the element nodes. After that, we simply take the first 255 characters of the text. If you are familiar with Ecto, you'll remember that migrations with the field type `:string` [default](https://hexdocs.pm/ecto_sql/Ecto.Migration.html#module-field-types) to a `varchar(255)` column, which is what we used. So, after the feature had been deployed to production for quite some time, we were caught by surprise by the following error:

```
ERROR 22001 (string_data_right_truncation) value too long for type character varying(255)
```

The database is clearly saying that we were trying to store more than the column could hold. But why? After testing, and testing some more, we had no clue what was going on. After doing our research, here's what we learned...

# Counting graphemes vs actual string size

If you read Elixir's [String](https://hexdocs.pm/elixir/String.html) module docs, you'll learn that some graphemes may have multiple encodings, especially accented characters in the Unicode Latin-1 script. Take the two words `sintético` and `sintético`: although they look identical, they use different encodings, as you can see below:

```elixir
String.graphemes("sintético")
#=> ["s", "i", "n", "t", "é", "t", "i", "c", "o"]

String.to_charlist("sintético")
#=> [115, 105, 110, 116, 101, 769, 116, 105, 99, 111]

String.length("sintético")
#=> 9
```

Even though they have the same length, notice how the codepoints returned by `String.to_charlist/1` differ from the previous ones:

```elixir
String.graphemes("sintético")
#=> ["s", "i", "n", "t", "é", "t", "i", "c", "o"]

String.to_charlist("sintético")
#=> [115, 105, 110, 116, 233, 116, 105, 99, 111]

String.length("sintético")
#=> 9
```

In unicode jargon, we say these words are canonically equivalent, even though they don't share the same internal representation. In the example above, the character `é` is represented by the codepoints `[101, 769]`, while the character `é` is represented by the codepoint `[233]`.
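Incidentally, you don't have to compare codepoints by hand: Elixir ships with `String.equivalent?/2`, which checks canonical equivalence by normalizing both strings before comparing them. A small sketch on top of the examples above:

```elixir
# Build the two representations explicitly instead of pasting
# visually identical literals
nfc = String.normalize("sintético", :nfc)
nfd = String.normalize("sintético", :nfd)

nfc == nfd
#=> false - the binaries differ codepoint by codepoint

String.equivalent?(nfc, nfd)
#=> true - but they are canonically equivalent
```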

You might still be curious as to why `String.length/1` returns the same size for both, even though they don't use the same codepoints. This happens because the `String` module works with graphemes instead of codepoints. Counting graphemes is considered a better approach for UTF-8 strings because a character is counted regardless of its internal representation. This means `String.length/1` behaves closer to what you expect when you "look" at the string.
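This is also exactly how our `varchar(255)` column overflowed. Here's a quick sketch (my own illustration, not from the original incident): a string that `String.slice/2` happily trims to 255 graphemes can decompose into far more codepoints, and the database counts those, not graphemes:

```elixir
# "é" in NFD form is one grapheme but two codepoints (e + combining acute)
e_nfd = String.normalize("é", :nfd)

word = String.duplicate(e_nfd, 255)

String.length(word)
#=> 255 - graphemes, what String.slice/2 and String.length/1 count

length(String.to_charlist(word))
#=> 510 - codepoints, much closer to what the column limit measures
```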

When I first learned this, I thought Elixir was being quirky. However, after careful examination, I realized that other languages like Ruby and JavaScript actually conflate the meanings of length and size, which gives you unexpected results in this scenario (unicode-wise, at least).

If you need to store such a string in a database, you might be tempted to use `String.length/1` to check its size, like we did previously. However, this is a very naive approach, because it does not take the actual size of the encoded string into consideration. Thankfully, since Elixir strings are just binaries, there's a simple way to check their actual size:

```elixir
byte_size("sintético")
#=> 10

byte_size("sintético")
#=> 11
```

# String normalization

Since many functions in the `String` module deal with graphemes directly, something like `String.slice/2` will return the right number of characters, but we can't trust it to return the right number of bytes. If you need a predictable representation, you might want to normalize the string first with `String.normalize/2`, so that each grapheme always maps to the same codepoints, and therefore the same number of bytes:

```elixir
String.to_charlist(String.normalize("sintético", :nfd))
#=> [115, 105, 110, 116, 101, 769, 116, 105, 99, 111]

String.to_charlist(String.normalize("sintético", :nfd))
#=> [115, 105, 110, 116, 101, 769, 116, 105, 99, 111]
```

If you want to know more about this, I recommend this excellent article that goes into detail on how unicode normalization works: [https://towardsdatascience.com/what-on-earth-is-unicode-normalization-56c005c55ad0](https://towardsdatascience.com/what-on-earth-is-unicode-normalization-56c005c55ad0).

# Finally, storing some information

After going through all of this material, it finally became obvious that if we wanted to store the text regardless of how it was initially represented, we had to normalize the string first. So this is what we did in our final solution:

```elixir
short_description =
  description
  |> Floki.parse_document!()
  |> Floki.text()
  |> String.slice(0..254)
  |> String.normalize(:nfd)
```

Before we close, I want to make some final remarks about a few Postgres peculiarities we learned along the way, and leave some literature for posterity. While trying to figure out how to store our summarized text, I discovered that there's no performance difference between storing a `char(n)`, a `varchar(n)`, or a `text` in Postgres. The [documentation](https://www.postgresql.org/docs/9.6/datatype-character.html) states the following:

> There is no performance difference among these three types, apart from increased storage space when using the blank-padded type, and a few extra CPU cycles to check the length when storing into a length-constrained column. While character(n) has performance advantages in some other database systems, there is no such advantage in PostgreSQL; in fact character(n) is usually the slowest of the three because of its additional storage costs. In most situations text or character varying should be used instead.

If, like me, you didn't know about this, you might make good use of this wiki page: [https://wiki.postgresql.org/wiki/Don%27t_Do_This](https://wiki.postgresql.org/wiki/Don%27t_Do_This#Don.27t_use_char.28n.29#Text_storage).

By the way, the reason Ecto doesn't do this by default seems to be compatibility with other databases. So, if you don't need to store exactly N characters like we did, I definitely recommend always using the `text` type and keeping the original information intact.
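Switching a column over is a short Ecto migration. A sketch, assuming a hypothetical `posts` table and column name:

```elixir
defmodule MyApp.Repo.Migrations.UseTextForDescriptions do
  use Ecto.Migration

  def change do
    alter table(:posts) do
      # :text maps to Postgres `text`: no length limit, same performance
      # profile as varchar(n). The :from option keeps this reversible.
      modify :short_description, :text, from: :string
    end
  end
end
```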

I also want to link two very good videos on the unicode topic that will make things a lot clearer if you are dealing with UTF-8 strings:

{{ youtube(id="MijmeoH9LT4") }}

{{ youtube(id="ut74oHojxqo") }}