Prepend HTTPS to Crawl Start URL (#2177)
Part of fix for #2167:
- Update the validation regex to accept URLs without a scheme prefix, not just those starting with www.
- Prepend `https://` if the URL is otherwise valid but missing a scheme
- URL list: if valid, iterate and prepend `https://` to each URL in the list; also run validation on blur for the URL list (see the sketch below)
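A rough standalone sketch of the intended behavior — the helper names `normalizeUrl` and `normalizeUrlList` are hypothetical, and the `validURL` pattern here is a simplified stand-in for the dperini-based regex in the actual diff below:

    // Hypothetical sketch; the real logic lives in workflow-editor.ts (diff below).
    function validURL(url: string): boolean {
      // Simplified stand-in: optional http(s):// scheme, then a hostname
      // with at least one dot, optionally followed by a path/query/fragment.
      return /^(?:https?:\/\/)?[a-z0-9-]+(?:\.[a-z0-9-]+)*\.[a-z]{2,}(?:[/?#]\S*)?$/i.test(
        url,
      );
    }

    function normalizeUrl(url: string): string {
      // Prepend https:// only when the URL is valid but has no scheme.
      if (
        validURL(url) &&
        !url.startsWith("http://") &&
        !url.startsWith("https://")
      ) {
        return "https://" + url;
      }
      return url;
    }

    // URL list: normalize each non-empty line the same way.
    const normalizeUrlList = (text: string): string =>
      text
        .split("\n")
        .map((line) => line.trim())
        .filter(Boolean)
        .map(normalizeUrl)
        .join("\n");

    console.log(normalizeUrl("example.com"));        // "https://example.com"
    console.log(normalizeUrl("http://example.com")); // unchanged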
ikreymer authored Nov 24, 2024
1 parent 76abfb3 commit ca012a4
Showing 2 changed files with 70 additions and 56 deletions.
124 changes: 69 additions & 55 deletions frontend/src/features/crawl-workflows/workflow-editor.ts
@@ -112,7 +112,6 @@ const DEFAULT_BEHAVIORS = [
   "autofetch",
   "siteSpecific",
 ];
-const MAX_ADDITIONAL_URLS = 100;
 
 const getDefaultProgressState = (hasConfigId = false): ProgressState => {
   let activeTab: StepName = "crawlSetup";
@@ -162,7 +161,8 @@ function getLocalizedWeekDays()
 }
 
 function validURL(url: string) {
-  return /((((https?):(?:\/\/)?)(?:[-;:&=+$,\w]+@)?[A-Za-z0-9.-]+|(?:www\.|[-;:&=+$,\w]+@)[A-Za-z0-9.-]+)((?:\/[+~%/.\w\-_]*)?\??(?:[-+=&;%@.\w_]*)#?(?:[.!/\\\w]*))?)/.test(
+  // adapted from: https://gist.github.com/dperini/729294
+  return /^(?:https?:\/\/)?(?:\S+(?::\S*)?@)?(?:(?!(?:10|127)(?:\.\d{1,3}){3})(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z0-9\u00a1-\uffff][a-z0-9\u00a1-\uffff_-]{0,62})?[a-z0-9\u00a1-\uffff]\.)+(?:[a-z\u00a1-\uffff]{2,}\.?))(?::\d{2,5})?(?:[/?#]\S*)?$/i.test(
     url,
   );
 }
@@ -173,7 +173,8 @@ const urlListToArray = flow(
   trimArray,
 );
 
-const URL_LIST_MAX_URLS = 1000;
+//todo: make this customizable, perhaps at deploy time
+const URL_LIST_MAX_URLS = 100;
 
 type CrawlConfigResponse = {
   run_now_job?: boolean;
@@ -813,6 +814,17 @@ export class WorkflowEditor extends BtrixElement {
             const text = msg("Please enter a valid URL.");
             inputEl.helpText = text;
             inputEl.setCustomValidity(text);
+          } else if (
+            inputEl.value &&
+            !inputEl.value.startsWith("https://") &&
+            !inputEl.value.startsWith("http://")
+          ) {
+            this.updateFormState(
+              {
+                urlList: "https://" + inputEl.value,
+              },
+              true,
+            );
           }
         }}
       >
@@ -834,19 +846,8 @@ https://archiveweb.page/guide`}
           required
           @keyup=${async (e: KeyboardEvent) => {
             if (e.key === "Enter") {
-              const inputEl = e.target as SlInput;
-              await inputEl.updateComplete;
-              if (!inputEl.value) return;
-              const { isValid, helpText } = this.validateUrlList(
-                inputEl.value,
-                MAX_ADDITIONAL_URLS,
-              );
-              inputEl.helpText = helpText;
-              if (isValid) {
-                inputEl.setCustomValidity("");
-              } else {
-                inputEl.setCustomValidity(helpText);
-              }
+              await (e.target as SlInput).updateComplete;
+              this.doValidateTextArea(e.target);
             }
           }}
           @sl-input=${(e: CustomEvent) => {
@@ -856,24 +857,16 @@
             }
           }}
           @sl-change=${async (e: CustomEvent) => {
-            const inputEl = e.target as SlInput;
-            if (!inputEl.value) return;
-            const { isValid, helpText } = this.validateUrlList(
-              inputEl.value,
-              MAX_ADDITIONAL_URLS,
-            );
-            inputEl.helpText = helpText;
-            if (isValid) {
-              inputEl.setCustomValidity("");
-            } else {
-              inputEl.setCustomValidity(helpText);
-            }
+            this.doValidateTextArea(e.target);
           }}
+          @sl-blur=${async (e: CustomEvent) => {
+            this.doValidateTextArea(e.target);
+          }}
         ></sl-textarea>
       `)}
       ${this.renderHelpTextCol(
         msg(
-          str`The crawler will visit and record each URL listed here. You can enter up to ${MAX_ADDITIONAL_URLS.toLocaleString()} URLs.`,
+          str`The crawler will visit and record each URL listed here. You can enter up to ${URL_LIST_MAX_URLS.toLocaleString()} URLs.`,
         ),
       )}
     `}
@@ -996,6 +989,17 @@
             const text = msg("Please enter a valid URL.");
             inputEl.helpText = text;
             inputEl.setCustomValidity(text);
+          } else if (
+            inputEl.value &&
+            !inputEl.value.startsWith("https://") &&
+            !inputEl.value.startsWith("http://")
+          ) {
+            this.updateFormState(
+              {
+                primarySeedUrl: "https://" + inputEl.value,
+              },
+              true,
+            );
           }
         }}
       >
@@ -1098,19 +1102,8 @@ https://example.net`}
 https://archiveweb.page/images/${"logo.svg"}`}
           @keyup=${async (e: KeyboardEvent) => {
             if (e.key === "Enter") {
-              const inputEl = e.target as SlInput;
-              await inputEl.updateComplete;
-              if (!inputEl.value) return;
-              const { isValid, helpText } = this.validateUrlList(
-                inputEl.value,
-                MAX_ADDITIONAL_URLS,
-              );
-              inputEl.helpText = helpText;
-              if (isValid) {
-                inputEl.setCustomValidity("");
-              } else {
-                inputEl.setCustomValidity(helpText);
-              }
+              await (e.target as SlInput).updateComplete;
+              this.doValidateTextArea(e.target);
             }
           }}
           @sl-input=${(e: CustomEvent) => {
@@ -1120,24 +1113,16 @@
             }
           }}
           @sl-change=${async (e: CustomEvent) => {
-            const inputEl = e.target as SlInput;
-            if (!inputEl.value) return;
-            const { isValid, helpText } = this.validateUrlList(
-              inputEl.value,
-              MAX_ADDITIONAL_URLS,
-            );
-            inputEl.helpText = helpText;
-            if (isValid) {
-              inputEl.setCustomValidity("");
-            } else {
-              inputEl.setCustomValidity(helpText);
-            }
+            this.doValidateTextArea(e.target);
           }}
+          @sl-blur=${async (e: CustomEvent) => {
+            this.doValidateTextArea(e.target);
+          }}
         ></sl-textarea>
       `)}
       ${this.renderHelpTextCol(
         msg(
-          str`The crawler will visit and record each URL listed here. You can enter up to ${MAX_ADDITIONAL_URLS.toLocaleString()} URLs.`,
+          str`The crawler will visit and record each URL listed here. You can enter up to ${URL_LIST_MAX_URLS.toLocaleString()} URLs.`,
         ),
       )}
     </div>
@@ -1146,6 +1131,21 @@ https://archiveweb.page/images/${"logo.svg"}`}
     `;
   };
 
+  private doValidateTextArea(target: EventTarget | null) {
+    const inputEl = target as SlInput;
+    if (!inputEl.value) return;
+    const { isValid, helpText } = this.validateUrlList(
+      inputEl.value,
+      URL_LIST_MAX_URLS,
+    );
+    inputEl.helpText = helpText;
+    if (isValid) {
+      inputEl.setCustomValidity("");
+    } else {
+      inputEl.setCustomValidity(helpText);
+    }
+  }
+
   private renderCrawlLimits() {
     // Max Pages minimum value cannot be lower than seed count
     const minPages = Math.max(
@@ -2075,6 +2075,20 @@
         str`Please remove or fix the following invalid URL: ${invalidUrl}`,
       );
     }
+    if (isValid) {
+      // auto-add https:// prefix if otherwise a valid URL
+      let updated = false;
+      for (let i = 0; i < urlList.length; i++) {
+        const url = urlList[i];
+        if (!url.startsWith("http://") && !url.startsWith("https://")) {
+          urlList[i] = "https://" + url;
+          updated = true;
+        }
+      }
+      if (updated) {
+        this.updateFormState({ urlList: urlList.join("\n") });
+      }
+    }
   }
   return { isValid, helpText };
 }
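For illustration, with the loop above a URL list entered as (hypothetical input):

    example.com
    http://example.org
    docs.example.net/page

would, once validated, be rewritten in the form state as:

    https://example.com
    http://example.org
    https://docs.example.net/page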
2 changes: 1 addition & 1 deletion frontend/xliff/es.xlf
@@ -3688,7 +3688,7 @@
     <source>The URL of the page to crawl.</source>
   </trans-unit>
   <trans-unit id="s41d2278219615589">
-    <source>The crawler will visit and record each URL listed here. You can enter up to <x equiv-text="${MAX_ADDITIONAL_URLS.toLocaleString()}" id="0"/> URLs.</source>
+    <source>The crawler will visit and record each URL listed here. You can enter up to <x equiv-text="${URL_LIST_MAX_URLS.toLocaleString()}" id="0"/> URLs.</source>
   </trans-unit>
   <trans-unit id="sfc5e402f8b21ef5f">
     <source>If checked, the crawler will visit pages one link away.</source>
